import urllib.robotparser
# Host and port of the server to probe. Adjust the port to match the one in
# the browser's address bar (Jupyter defaults to 8888).
url_prefix = "localhost:8888"
robots_url = "http://%s/view/robots.txt" % url_prefix
robots_url
'http://localhost:8888/view/robots.txt'
# Handing the URL to the constructor has the same effect as calling
# set_url() afterwards; read() then fetches the file and feeds it to the parser.
rp = urllib.robotparser.RobotFileParser(url=robots_url)
rp.read()
# Ask whether the "Baiduspider" user agent may fetch the site root ...
rp.can_fetch(useragent="Baiduspider", url="http://%s/" % url_prefix)
True
# ... and the /article/ path.
rp.can_fetch(useragent="Baiduspider", url="http://%s/article/" % url_prefix)
True
import requests
# Fetch the demo page. requests applies no timeout by default, so an explicit
# one keeps this call from blocking forever if the server never answers.
r = requests.get("http://%s/view/crawlme.html" % url_prefix, timeout=10)
# Print the response body, its text encoding, the final URL (which differs
# from the requested one when the server redirects) and the HTTP status code.
print(r.text)
print(r.encoding)
print(r.url)
print(r.status_code)
<!DOCTYPE HTML>
<html>
<head>
<meta charset="utf-8">
<title>Jupyter Notebook</title>
<link id="favicon" rel="shortcut icon" type="image/x-icon" href="/static/base/images/favicon.ico?v=97c6417ed01bdc0ae3ef32ae4894fd03">
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<link rel="stylesheet" href="/static/components/jquery-ui/themes/smoothness/jquery-ui.min.css?v=3c2a865c832a1322285c55c6ed99abb2" type="text/css" />
<link rel="stylesheet" href="/static/components/jquery-typeahead/dist/jquery.typeahead.min.css?v=7afb461de36accb1aa133a1710f5bc56" type="text/css" />
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<link rel="stylesheet" href="/static/style/style.min.css?v=e91a43337d7c294cc9fab2938fa723b3" type="text/css"/>
<link rel="stylesheet" href="/static/auth/css/override.css?v=ae668279c8a80c1d2e81fc28345169ee" type="text/css" />
<link rel="stylesheet" href="/custom/custom.css" type="text/css" />
<script src="/static/components/es6-promise/promise.min.js?v=f004a16cb856e0ff11781d01ec5ca8fe" type="text/javascript" charset="utf-8"></script>
<script src="/static/components/react/react.production.min.js?v=34f96ffc962a7deecc83037ccb582b58" type="text/javascript"></script>
<script src="/static/components/react/react-dom.production.min.js?v=b14d91fb641317cda38dbc9dbf985ab4" type="text/javascript"></script>
<script src="/static/components/create-react-class/index.js?v=94feb9971ce6d26211729abc43f96cd2" type="text/javascript"></script>
<script src="/static/components/requirejs/require.js?v=951f856e81496aaeec2e71a1c2c0d51f" type="text/javascript" charset="utf-8"></script>
<script>
require.config({
urlArgs: "v=20191228195158",
baseUrl: '/static/',
paths: {
'auth/js/main': 'auth/js/main.min',
custom : '/custom',
nbextensions : '/nbextensions',
kernelspecs : '/kernelspecs',
underscore : 'components/underscore/underscore-min',
backbone : 'components/backbone/backbone-min',
jed: 'components/jed/jed',
jquery: 'components/jquery/jquery.min',
json: 'components/requirejs-plugins/src/json',
text: 'components/requirejs-text/text',
bootstrap: 'components/bootstrap/dist/js/bootstrap.min',
bootstraptour: 'components/bootstrap-tour/build/js/bootstrap-tour.min',
'jquery-ui': 'components/jquery-ui/jquery-ui.min',
moment: 'components/moment/min/moment-with-locales',
codemirror: 'components/codemirror',
termjs: 'components/xterm.js/xterm',
typeahead: 'components/jquery-typeahead/dist/jquery.typeahead.min',
},
map: { // for backward compatibility
"*": {
"jqueryui": "jquery-ui",
}
},
shim: {
typeahead: {
deps: ["jquery"],
exports: "typeahead"
},
underscore: {
exports: '_'
},
backbone: {
deps: ["underscore", "jquery"],
exports: "Backbone"
},
bootstrap: {
deps: ["jquery"],
exports: "bootstrap"
},
bootstraptour: {
deps: ["bootstrap"],
exports: "Tour"
},
"jquery-ui": {
deps: ["jquery"],
exports: "$"
}
},
waitSeconds: 30,
});
require.config({
map: {
'*':{
'contents': 'services/contents',
}
}
});
// error-catching custom.js shim.
define("custom", function (require, exports, module) {
try {
var custom = require('custom/custom');
console.debug('loaded custom.js');
return custom;
} catch (e) {
console.error("error loading custom.js", e);
return {};
}
})
document.nbjs_translations = {"domain": "nbjs", "locale_data": {"nbjs": {"": {"domain": "nbjs"}}}};
document.documentElement.lang = navigator.language.toLowerCase();
</script>
</head>
<body class=""
dir="ltr">
<noscript>
<div id='noscript'>
Jupyter Notebook需要的JavaScript.<br>
请允许它继续.
</div>
</noscript>
<div id="header" role="navigation" aria-label="Top Menu">
<div id="header-container" class="container">
<div id="ipython_notebook" class="nav navbar-brand"><a href="/tree" title='指示板'>
<img src='/static/base/images/logo.png?v=641991992878ee24c6f3826e81054a0f' alt='Jupyter Notebook'/>
</a></div>
</div>
<div class="header-bar"></div>
</div>
<div id="site">
<div id="ipython-main-app" class="container">
<div class="row">
<div class="navbar col-sm-8">
<div class="navbar-inner">
<div class="container">
<div class="center-nav">
<form action="/login?next=%2Fview%2Fcrawlme.html" method="post" class="navbar-form pull-left">
<input type="hidden" name="_xsrf" value="2|ddfea313|117b8c4dda745abc7f40ce62064e53c7|1577542029"/>
<label for="password_input"><strong>密码或者token:</strong></label>
<input type="password" name="password" id="password_input" class="form-control">
<button type="submit" class="btn btn-default" id="login_submit">登录</button>
</form>
</div>
</div>
</div>
</div>
</div>
<div class="col-sm-6 col-sm-offset-3 text-left rendered_html">
<h3>
Token authentication is enabled
</h3>
<p>
If no password has been configured, you need to open the notebook
server with its login token in the URL, or paste it above.
This requirement will be lifted if you
<b><a href='https://jupyter-notebook.readthedocs.io/en/stable/public_server.html'>
enable a password</a></b>.
</p>
<p>
The command:
<pre>jupyter notebook list</pre>
will show you the URLs of running servers with their tokens,
which you can copy and paste into your browser. For example:
</p>
<pre>Currently running servers:
http://localhost:8888/?token=c8de56fa... :: /Users/you/notebooks
</pre>
<p>
or you can paste just the token value into the password field on this
page.
</p>
<p>
See
<b><a
href='https://jupyter-notebook.readthedocs.io/en/stable/public_server.html'>
the documentation on how to enable a password</a>
</b>
in place of token authentication,
if you would like to avoid dealing with random tokens.
</p>
<p>
Cookies are required for authenticated access to notebooks.
</p>
<h3>Setup a Password</h3>
<p> You can also setup a password by entering your token and a new password
on the fields below:</p>
<form action="/login?next=%2Fview%2Fcrawlme.html" method="post" class="">
<input type="hidden" name="_xsrf" value="2|ddfea313|117b8c4dda745abc7f40ce62064e53c7|1577542029"/>
<div class="form-group">
<label for="token_input"><h4>Token</h4></label>
<input type="password" name="password" id="token_input" class="form-control">
</div>
<div class="form-group">
<label for="new_password_input"><h4>New Password</h4></label>
<input type="password" name="new_password" id="new_password_input" class="form-control" required>
</div>
<div class="form-group">
<button type="submit" class="btn btn-default" id="login_new_pass_submit">Log in and set new password</button>
</div>
</form>
</div>
</div>
</div>
<script type="text/javascript">
require(["auth/js/main"], function (auth) {
auth.login_main();
});
</script>
<script type='text/javascript'>
function _remove_token_from_url() {
if (window.location.search.length <= 1) {
return;
}
var search_parameters = window.location.search.slice(1).split('&');
for (var i = 0; i < search_parameters.length; i++) {
if (search_parameters[i].split('=')[0] === 'token') {
// remote token from search parameters
search_parameters.splice(i, 1);
var new_search = '';
if (search_parameters.length) {
new_search = '?' + search_parameters.join('&');
}
var new_url = window.location.origin +
window.location.pathname +
new_search +
window.location.hash;
window.history.replaceState({}, "", new_url);
return;
}
}
}
_remove_token_from_url();
</script>
</body>
</html>
UTF-8
http://localhost:8888/login?next=%2Fview%2Fcrawlme.html
200
正则表达式
import re
# Every word that starts with "f" (\b anchors the match at a word boundary).
re.findall(r'\bf[a-z]*', 'which foot or hand fell fastest')
['foot', 'fell', 'fastest']
# Collapse an immediately repeated word: \1 refers back to the first group.
re.sub(r'(\b[a-z]+) \1', r'\1', 'cat in the the hat')
'cat in the hat'
html = '''\
<html>
<head>
</head>
<body>
<h1>Title</h1>
<p> This is a paragraph</p>
<table>
<tr>
<td> <a href='a.html'>link a</a> </td>
</tr>
</table>
<a href='b.html'>link b</a>
</body>
</html>
'''
# The non-greedy .*? stops at the first closing quote, so each href value is
# captured separately.
re.findall(r"href='(.*?)'", html)
['a.html', 'b.html']
# Greedy b* consumes as many "b" characters as possible ...
re.match(r'ab*', 'abbbb').group()
'abbbb'
# ... while the lazy b*? consumes as few as possible (here: none).
re.match(r'ab*?', 'abbbb').group()
'a'
# {3,4} is greedy as well: it takes four digits when they are available.
re.match(r'a\d{3,4}', 'a1234567').group()
'a1234'
# Use | for alternation; \| matches a literal "|" character.
re.match(r'(abc)|(def)', 'abc').group()
'abc'
# Raw string: in a normal string literal "\d" is an invalid escape sequence
# and triggers a warning on current Python versions.
p = re.compile(r'\d+')
txt = '12 drummers drumming, 11 pipers piping, 10 lords a-leaping'
p.findall(txt)
['12', '11', '10']
# Groups are numbered by their opening parenthesis: group 0 is the whole
# match, group 1 the outer parentheses, group 2 the nested ones.
p = re.compile(r'(a(b)c)d')
m = p.match('abcd')
m.group(0)
'abcd'
m.group(1)
'abc'
m.group(2)
'b'
# A character followed by itself; findall returns the captured group, i.e. one
# copy of each doubled character.
re.findall(r'(.)\1', '明明亮亮蛋蛋')
['明', '亮', '蛋']