import urllib.robotparser
# Host and port of the running server: use the port shown in the browser's
# address bar (Jupyter defaults to 8888).
url_prefix = "localhost:8888"
rp = urllib.robotparser.RobotFileParser()
# Location of the robots.txt file that the parser will be pointed at.
robots_url = f"http://{url_prefix}/view/robots.txt"
robots_url
'http://localhost:8888/view/robots.txt'
# Point the parser at the robots.txt URL, download and parse it, then ask
# whether the "Baiduspider" user agent is allowed to fetch the site root.
# NOTE(review): rp.read() performs a network request. The crawlme.html request
# recorded later in this file ended up on the server's login page, so confirm
# that robots.txt itself is served without authentication.
rp.set_url(robots_url)
rp.read()
rp.can_fetch("Baiduspider", "http://%s/" % url_prefix)
True
# Same permission check for the /article/ path.
rp.can_fetch("Baiduspider", "http://%s/article/" % url_prefix)
True
import requests
# Fetch the demo page and print its body, detected encoding, final URL and
# HTTP status code.
# A timeout is passed because requests otherwise waits indefinitely when the
# server never answers.
# NOTE: r.url is the URL after redirects. In the recorded run below it is
# /login?next=%2Fview%2Fcrawlme.html, i.e. the server's login page was
# returned instead of crawlme.html, so compare r.url with the requested URL.
r = requests.get("http://%s/view/crawlme.html" % url_prefix, timeout=10)
print(r.text)
print(r.encoding)
print(r.url)
print(r.status_code)
<!DOCTYPE HTML> <html> <head> <meta charset="utf-8"> <title>Jupyter Notebook</title> <link id="favicon" rel="shortcut icon" type="image/x-icon" href="/static/base/images/favicon.ico?v=97c6417ed01bdc0ae3ef32ae4894fd03"> <meta http-equiv="X-UA-Compatible" content="IE=edge" /> <link rel="stylesheet" href="/static/components/jquery-ui/themes/smoothness/jquery-ui.min.css?v=3c2a865c832a1322285c55c6ed99abb2" type="text/css" /> <link rel="stylesheet" href="/static/components/jquery-typeahead/dist/jquery.typeahead.min.css?v=7afb461de36accb1aa133a1710f5bc56" type="text/css" /> <meta name="viewport" content="width=device-width, initial-scale=1.0"> <link rel="stylesheet" href="/static/style/style.min.css?v=e91a43337d7c294cc9fab2938fa723b3" type="text/css"/> <link rel="stylesheet" href="/static/auth/css/override.css?v=ae668279c8a80c1d2e81fc28345169ee" type="text/css" /> <link rel="stylesheet" href="/custom/custom.css" type="text/css" /> <script src="/static/components/es6-promise/promise.min.js?v=f004a16cb856e0ff11781d01ec5ca8fe" type="text/javascript" charset="utf-8"></script> <script src="/static/components/react/react.production.min.js?v=34f96ffc962a7deecc83037ccb582b58" type="text/javascript"></script> <script src="/static/components/react/react-dom.production.min.js?v=b14d91fb641317cda38dbc9dbf985ab4" type="text/javascript"></script> <script src="/static/components/create-react-class/index.js?v=94feb9971ce6d26211729abc43f96cd2" type="text/javascript"></script> <script src="/static/components/requirejs/require.js?v=951f856e81496aaeec2e71a1c2c0d51f" type="text/javascript" charset="utf-8"></script> <script> require.config({ urlArgs: "v=20191228195158", baseUrl: '/static/', paths: { 'auth/js/main': 'auth/js/main.min', custom : '/custom', nbextensions : '/nbextensions', kernelspecs : '/kernelspecs', underscore : 'components/underscore/underscore-min', backbone : 'components/backbone/backbone-min', jed: 'components/jed/jed', jquery: 'components/jquery/jquery.min', json: 
'components/requirejs-plugins/src/json', text: 'components/requirejs-text/text', bootstrap: 'components/bootstrap/dist/js/bootstrap.min', bootstraptour: 'components/bootstrap-tour/build/js/bootstrap-tour.min', 'jquery-ui': 'components/jquery-ui/jquery-ui.min', moment: 'components/moment/min/moment-with-locales', codemirror: 'components/codemirror', termjs: 'components/xterm.js/xterm', typeahead: 'components/jquery-typeahead/dist/jquery.typeahead.min', }, map: { // for backward compatibility "*": { "jqueryui": "jquery-ui", } }, shim: { typeahead: { deps: ["jquery"], exports: "typeahead" }, underscore: { exports: '_' }, backbone: { deps: ["underscore", "jquery"], exports: "Backbone" }, bootstrap: { deps: ["jquery"], exports: "bootstrap" }, bootstraptour: { deps: ["bootstrap"], exports: "Tour" }, "jquery-ui": { deps: ["jquery"], exports: "$" } }, waitSeconds: 30, }); require.config({ map: { '*':{ 'contents': 'services/contents', } } }); // error-catching custom.js shim. define("custom", function (require, exports, module) { try { var custom = require('custom/custom'); console.debug('loaded custom.js'); return custom; } catch (e) { console.error("error loading custom.js", e); return {}; } }) document.nbjs_translations = {"domain": "nbjs", "locale_data": {"nbjs": {"": {"domain": "nbjs"}}}}; document.documentElement.lang = navigator.language.toLowerCase(); </script> </head> <body class="" dir="ltr"> <noscript> <div id='noscript'> Jupyter Notebook需要的JavaScript.<br> 请允许它继续. 
</div> </noscript> <div id="header" role="navigation" aria-label="Top Menu"> <div id="header-container" class="container"> <div id="ipython_notebook" class="nav navbar-brand"><a href="/tree" title='指示板'> <img src='/static/base/images/logo.png?v=641991992878ee24c6f3826e81054a0f' alt='Jupyter Notebook'/> </a></div> </div> <div class="header-bar"></div> </div> <div id="site"> <div id="ipython-main-app" class="container"> <div class="row"> <div class="navbar col-sm-8"> <div class="navbar-inner"> <div class="container"> <div class="center-nav"> <form action="/login?next=%2Fview%2Fcrawlme.html" method="post" class="navbar-form pull-left"> <input type="hidden" name="_xsrf" value="2|ddfea313|117b8c4dda745abc7f40ce62064e53c7|1577542029"/> <label for="password_input"><strong>密码或者token:</strong></label> <input type="password" name="password" id="password_input" class="form-control"> <button type="submit" class="btn btn-default" id="login_submit">登录</button> </form> </div> </div> </div> </div> </div> <div class="col-sm-6 col-sm-offset-3 text-left rendered_html"> <h3> Token authentication is enabled </h3> <p> If no password has been configured, you need to open the notebook server with its login token in the URL, or paste it above. This requirement will be lifted if you <b><a href='https://jupyter-notebook.readthedocs.io/en/stable/public_server.html'> enable a password</a></b>. </p> <p> The command: <pre>jupyter notebook list</pre> will show you the URLs of running servers with their tokens, which you can copy and paste into your browser. For example: </p> <pre>Currently running servers: http://localhost:8888/?token=c8de56fa... :: /Users/you/notebooks </pre> <p> or you can paste just the token value into the password field on this page. </p> <p> See <b><a href='https://jupyter-notebook.readthedocs.io/en/stable/public_server.html'> the documentation on how to enable a password</a> </b> in place of token authentication, if you would like to avoid dealing with random tokens. 
</p> <p> Cookies are required for authenticated access to notebooks. </p> <h3>Setup a Password</h3> <p> You can also setup a password by entering your token and a new password on the fields below:</p> <form action="/login?next=%2Fview%2Fcrawlme.html" method="post" class=""> <input type="hidden" name="_xsrf" value="2|ddfea313|117b8c4dda745abc7f40ce62064e53c7|1577542029"/> <div class="form-group"> <label for="token_input"><h4>Token</h4></label> <input type="password" name="password" id="token_input" class="form-control"> </div> <div class="form-group"> <label for="new_password_input"><h4>New Password</h4></label> <input type="password" name="new_password" id="new_password_input" class="form-control" required> </div> <div class="form-group"> <button type="submit" class="btn btn-default" id="login_new_pass_submit">Log in and set new password</button> </div> </form> </div> </div> </div> <script type="text/javascript"> require(["auth/js/main"], function (auth) { auth.login_main(); }); </script> <script type='text/javascript'> function _remove_token_from_url() { if (window.location.search.length <= 1) { return; } var search_parameters = window.location.search.slice(1).split('&'); for (var i = 0; i < search_parameters.length; i++) { if (search_parameters[i].split('=')[0] === 'token') { // remote token from search parameters search_parameters.splice(i, 1); var new_search = ''; if (search_parameters.length) { new_search = '?' + search_parameters.join('&'); } var new_url = window.location.origin + window.location.pathname + new_search + window.location.hash; window.history.replaceState({}, "", new_url); return; } } } _remove_token_from_url(); </script> </body> </html> UTF-8 http://localhost:8888/login?next=%2Fview%2Fcrawlme.html 200
正则表达式
import re
# Find every word starting with "f": \b anchors at a word boundary, then "f"
# followed by any run of lowercase letters.
re.findall(r'\bf[a-z]*', 'which foot or hand fell fastest')
['foot', 'fell', 'fastest']
# Collapse a doubled word: \1 refers back to the word captured by group 1, and
# the replacement keeps a single copy of it.
re.sub(r'(\b[a-z]+) \1', r'\1', 'cat in the the hat')
'cat in the hat'
# Small HTML sample used to demonstrate extracting link targets.
html = '''\
<html>
<head>
</head>
<body>
<h1>Title</h1>
<p> This is a paragraph</p>
<table>
<tr>
<td> <a href='a.html'>link a</a> </td>
</tr>
</table>
<a href='b.html'>link b</a>
</body>
</html>
'''
# The non-greedy (.*?) stops at the first closing quote, so each href value is
# captured separately.
re.findall("href='(.*?)'", html)
['a.html', 'b.html']
# b* is greedy: it consumes as many "b" characters as possible.
re.match(r'ab*', 'abbbb').group()
'abbbb'
# b*? is non-greedy: it matches as few "b" characters as possible (here, none).
re.match(r'ab*?', 'abbbb').group()
'a'
# \d{3,4} matches three or four digits; being greedy it takes four when available.
re.match(r'a\d{3,4}', 'a1234567').group()
'a1234'
# Use | for alternation ("or"); \| matches a literal pipe character.
# group() with no argument returns the whole match.
re.match(r'(abc)|(def)', 'abc').group()
'abc'
# Compile the pattern once and reuse it. The raw string stops Python from
# treating \d as an (invalid) string escape before the regex engine sees it.
p = re.compile(r'\d+')
txt = '12 drummers drumming, 11 pipers piping, 10 lords a-leaping'
# Every maximal run of digits in the text.
p.findall(txt)
['12', '11', '10']
# Nested groups are numbered by the position of their opening parenthesis.
p = re.compile('(a(b)c)d')
m = p.match('abcd')
# Group 0 is the entire match.
m.group(0)
'abcd'
# Group 1 is the outer parenthesised sub-pattern.
m.group(1)
'abc'
# Group 2 is the inner, nested group.
m.group(2)
'b'
# (.)\1 matches any character immediately followed by itself; with a single
# capturing group, findall returns the captured character for each match.
re.findall(r'(.)\1', '明明亮亮蛋蛋')
['明', '亮', '蛋']