import requests, bs4
# Download the puzzle page and parse it into a BeautifulSoup tree.
url = "http://oscar-lab.org/people/~zren/files/puzzle.html"
# A timeout keeps the script from blocking forever on an unresponsive server.
res = requests.get(url, timeout=10)
# Stop on 4xx/5xx responses instead of parsing an error page as if it were the table.
res.raise_for_status()
# Use the charset detected from the body instead of the one requests chose by default.
res.encoding = res.apparent_encoding
# Name the parser explicitly: without it bs4 picks whichever parser happens to be
# installed (and warns), so the resulting tree could differ between machines.
doc = bs4.BeautifulSoup(res.text, "html.parser")
print(doc.prettify())
<!DOCTYPE html>
<html>
<body>
<h1>
Today's news (Adapted from
<a href="https://news.ycombinator.com">
Hacker's News
</a>
)
</h1>
<table border="0">
<thead>
<tr>
<th>
ID
</th>
<th>
Score
</th>
<th>
Title
</th>
</tr>
</thead>
<tbody>
<tr>
<td>
1
</td>
<td>
1
</td>
<td>
<a href="https://docs.google.com/document/d/155yNpfR7dGKuN-4rbrvbJLcJkhGa_HqvVuyPK7UEfPo/edit">
Reverse engineering YouTube demonetization algorithm
</a>
</td>
</tr>
<tr>
<td>
2
</td>
<td>
9
</td>
<td>
<a href="https://github.com/laurent22/joplin/">
Joplin A note-taking and to-do app with builds for desktop, mobile, terminal
</a>
</td>
</tr>
<tr>
<td>
3
</td>
<td>
4
</td>
<td>
<a href="https://motherboard.vice.com/en_us/article/ywnmkk/coinbase-irs-14000-bitcoin-tax">
Coinbase Ordered to Turn Over Identities of 14,355 Crypto Traders to the IRS
</a>
</td>
</tr>
<tr>
<td>
4
</td>
<td>
8
</td>
<td>
<a href="https://arstechnica.com/information-technology/2017/11/australian-man-uses-snack-bags-as-faraday-cage-to-block-tracking-by-employer/">
Australian uses snack bags as Faraday cage to block tracking by employer
</a>
</td>
</tr>
<tr>
<td>
5
</td>
<td>
3
</td>
<td>
<a href="https://jontysinai.github.io">
A blog I started on Neural Networks and Probability
</a>
</td>
</tr>
<tr>
<td>
6
</td>
<td>
2
</td>
<td>
<a href="https://www.bloomberg.com/news/articles/2017-11-30/so-it-looks-like-nobel-economics-laureates-don-t-like-bitcoin">
It Looks Like Nobel Economics Laureates Don't Like Bitcoin
</a>
</td>
</tr>
<tr>
<td>
7
</td>
<td>
7
</td>
<td>
<a href="http://www.lowrisc.org/blog/2017/11/seventh-risc-v-workshop-day-two/">
Seventh RISC-V Workshop: Day Two - LowRISC
</a>
</td>
</tr>
<tr>
<td>
8
</td>
<td>
6
</td>
<td>
<a href="http://www.spiegel.de/international/0,1518,433134,00.html">
China's Art Factories: Van Gogh from the Sweatshop (2006)
</a>
</td>
</tr>
<tr>
<td>
9
</td>
<td>
5
</td>
<td>
<a href="https://www.nytimes.com/2017/11/29/business/waymo-uber-trial.html">
Judge Tells Uber Lawyer: 'It Looks Like You Covered This Up'
</a>
</td>
</tr>
<tr>
<td>
10
</td>
<td>
0
</td>
<td>
<a href="http://plumshell.com/2017/11/30/as-a-solo-app-developer-i-decided-to-offer-phone-support-and-this-is-what-happened/">
As a solo developer, I decided to offer phone support
</a>
</td>
</tr>
<tr>
<td>
11
</td>
<td>
1
</td>
<td>
<a href="http://techcrunch.com/2016/10/27/why-did-protonmail-vanish-from-google-search-results-for-months/amp/">
Why Did ProtonMail Vanish from Google Search Results?
</a>
</td>
</tr>
<tr>
<td>
12
</td>
<td>
6
</td>
<td>
<a href="https://blog.pocketcluster.io/2017/11/30/weekly-machine-learning-opensource-roundup-nov-30-2017/">
Weekly Machine Learning Toolset and Library Roundup - Nov. 30, 2017
</a>
</td>
</tr>
<tr>
<td>
13
</td>
<td>
8
</td>
<td>
<a href="https://www.space.com/38912-pulsar-discovery-by-jocelyn-bell.html">
50 Years Ago Jocelyn Bell Discovered Pulsars
</a>
</td>
</tr>
<tr>
<td>
14
</td>
<td>
6
</td>
<td>
<a href="https://adventofcode.com/2017">
Advent of Code 2017
</a>
</td>
</tr>
<tr>
<td>
15
</td>
<td>
9
</td>
<td>
<a href="item?id=15815913">
SafeButler (YC S17) is hiring employee #2 to modernize insurance
</a>
</td>
</tr>
<tr>
<td>
16
</td>
<td>
4
</td>
<td>
<a href="https://twitter.com/4Dgifts/status/936223487986946048">
BTC addresses whose private keys are from Sha256 of another public address
</a>
</td>
</tr>
<tr>
<td>
17
</td>
<td>
5
</td>
<td>
<a href="http://phasenoise.livejournal.com/2017/11/3185.html">
Review and Teardown of a Cheap GPS Jammer
</a>
</td>
</tr>
<tr>
<td>
18
</td>
<td>
4
</td>
<td>
<a href="https://blog.coinbase.com/coinbase-obtains-partial-victory-over-irs-dac041db59a3">
Coinbase Obtains Partial Victory Over IRS
</a>
</td>
</tr>
<tr>
<td>
19
</td>
<td>
2
</td>
<td>
<a href="http://www.danielwilczynski.com/2017/11/29/bitcoin-price-index/">
How to Profit from Bitcoin Bubble
</a>
</td>
</tr>
<tr>
<td>
20
</td>
<td>
7
</td>
<td>
<a href="https://fwupd.org/">
Linux Vendor Firmware Service
</a>
</td>
</tr>
<tr>
<td>
21
</td>
<td>
8
</td>
<td>
<a href="http://www.writethedocs.org/guide/writing/mindshare/">
Building mindshare in a company
</a>
</td>
</tr>
<tr>
<td>
22
</td>
<td>
2
</td>
<td>
<a href="https://www.wired.com/story/dazzle-camouflage-san-diego-world-war-i/">
How Cubism Protected Warships in WorldWar I
</a>
</td>
</tr>
<tr>
<td>
23
</td>
<td>
3
</td>
<td>
<a href="http://publicdomainreview.org/collections/the-model-book-of-calligraphy-1561-1596/">
The Model Book of Calligraphy (1561-1596)
</a>
</td>
</tr>
<tr>
<td>
24
</td>
<td>
4
</td>
<td>
<a href="https://shkspr.mobi/blog/2017/11/how-do-you-move-out-of-a-smarthome/">
How do you move out of a smarthome?
</a>
</td>
</tr>
<tr>
<td>
25
</td>
<td>
7
</td>
<td>
<a href="http://www.animationmagazine.net/people/software-giant-autodesk-to-axe-13-of-global-workforce/">
Software Giant Autodesk to Axe 13% of Global Workforce
</a>
</td>
</tr>
<tr>
<td>
26
</td>
<td>
8
</td>
<td>
<a href="http://www.brendangregg.com/blog/2017-11-29/aws-ec2-virtualization-2017.html">
AWS EC2 Virtualization 2017: Including Nitro
</a>
</td>
</tr>
<tr>
<td>
27
</td>
<td>
6
</td>
<td>
<a href="http://www.bbc.co.uk/news/technology-42166089">
Google faces UK legal action for bypassing iPhone privacy settings to target ads
</a>
</td>
</tr>
<tr>
<td>
28
</td>
<td>
3
</td>
<td>
<a href="https://www.indexventures.com/optionplan">
OptionPlan, an app for founders looking to design a stock option plan
</a>
</td>
</tr>
<tr>
<td>
29
</td>
<td>
9
</td>
<td>
<a href="http://www.loper-os.org/?p=1927">
The Peculiarly Quiet Decline and Fall of the KVM
</a>
</td>
</tr>
<tr>
<td>
30
</td>
<td>
1
</td>
<td>
<a href="https://www.gamingonlinux.com/articles/nvidia-has-confirmed-a-driver-bug-resulting-in-a-loss-of-performance-on-linux.10804">
Nvidia has confirmed a driver bug resulting in a loss of performance on Linux
</a>
</td>
</tr>
</tbody>
</table>
</body>
</html>
# Turn every row of the table body into a record with ID, score, title and link.
news = []
for row in doc.tbody.find_all("tr"):
    # Cells are read by position: 0 = ID, 1 = score, 2 = title (wrapping the link).
    cells = row.find_all("td")
    # Build the record straight from the cells; no module-level `url` is rebound
    # here, so the page address defined earlier is not overwritten.
    news.append({
        "ID": cells[0].text,            # kept as text, exactly as scraped
        "score": int(cells[1].text),    # numeric so it can be compared below
        "title": cells[2].text,
        "url": cells[2].a["href"],      # href of the anchor inside the title cell
    })

# Links of the stories whose score is strictly greater than 5.
higher_5 = [item["url"] for item in news if item["score"] > 5]
print(higher_5)
['https://github.com/laurent22/joplin/', 'https://arstechnica.com/information-technology/2017/11/australian-man-uses-snack-bags-as-faraday-cage-to-block-tracking-by-employer/', 'http://www.lowrisc.org/blog/2017/11/seventh-risc-v-workshop-day-two/', 'http://www.spiegel.de/international/0,1518,433134,00.html', 'https://blog.pocketcluster.io/2017/11/30/weekly-machine-learning-opensource-roundup-nov-30-2017/', 'https://www.space.com/38912-pulsar-discovery-by-jocelyn-bell.html', 'https://adventofcode.com/2017', 'item?id=15815913', 'https://fwupd.org/', 'http://www.writethedocs.org/guide/writing/mindshare/', 'http://www.animationmagazine.net/people/software-giant-autodesk-to-axe-13-of-global-workforce/', 'http://www.brendangregg.com/blog/2017-11-29/aws-ec2-virtualization-2017.html', 'http://www.bbc.co.uk/news/technology-42166089', 'http://www.loper-os.org/?p=1927']
import csv

# Column order of the CSV; the same strings are the keys of each record in `news`.
header = ['ID', 'score', 'title', 'url']
# One list per record, with values laid out in header order.
rows = [[item[column] for column in header] for item in news]
# newline="" leaves line-ending handling to the csv module. The explicit UTF-8
# encoding keeps non-ASCII titles from depending on the platform default.
# The `with` block closes the file even if writing raises.
with open("news.csv", "w", newline="", encoding="utf-8") as handle:
    writer = csv.writer(handle)
    writer.writerows([header] + rows)
import json

# Dump the scraped records as a JSON array of objects.
# The `with` block closes the file even if serialization raises.
with open("news.json", "w", encoding="utf-8") as handle:
    json.dump(news, handle)