import requests, bs4
# Download the puzzle page and parse it into a BeautifulSoup tree (`doc`).
url = "http://oscar-lab.org/people/~zren/files/puzzle.html"
# Without a timeout requests waits indefinitely on an unresponsive server.
res = requests.get(url, timeout=10)
# Fail here on a 4xx/5xx reply instead of parsing an error page as the table.
res.raise_for_status()
# Decode the body using the encoding detected from its content.
res.encoding = res.apparent_encoding
# Name the parser explicitly so the resulting tree does not depend on which
# optional parsers (lxml, html5lib) happen to be installed.
doc = bs4.BeautifulSoup(res.text, "html.parser")
print(doc.prettify())
<!DOCTYPE html> <html> <body> <h1> Today's news (Adapted from <a href="https://news.ycombinator.com"> Hacker's News </a> ) </h1> <table border="0"> <thead> <tr> <th> ID </th> <th> Score </th> <th> Title </th> </tr> </thead> <tbody> <tr> <td> 1 </td> <td> 1 </td> <td> <a href="https://docs.google.com/document/d/155yNpfR7dGKuN-4rbrvbJLcJkhGa_HqvVuyPK7UEfPo/edit"> Reverse engineering YouTube demonetization algorithm </a> </td> </tr> <tr> <td> 2 </td> <td> 9 </td> <td> <a href="https://github.com/laurent22/joplin/"> Joplin A note-taking and to-do app with builds for desktop, mobile, terminal </a> </td> </tr> <tr> <td> 3 </td> <td> 4 </td> <td> <a href="https://motherboard.vice.com/en_us/article/ywnmkk/coinbase-irs-14000-bitcoin-tax"> Coinbase Ordered to Turn Over Identities of 14,355 Crypto Traders to the IRS </a> </td> </tr> <tr> <td> 4 </td> <td> 8 </td> <td> <a href="https://arstechnica.com/information-technology/2017/11/australian-man-uses-snack-bags-as-faraday-cage-to-block-tracking-by-employer/"> Australian uses snack bags as Faraday cage to block tracking by employer </a> </td> </tr> <tr> <td> 5 </td> <td> 3 </td> <td> <a href="https://jontysinai.github.io"> A blog I started on Neural Networks and Probability </a> </td> </tr> <tr> <td> 6 </td> <td> 2 </td> <td> <a href="https://www.bloomberg.com/news/articles/2017-11-30/so-it-looks-like-nobel-economics-laureates-don-t-like-bitcoin"> It Looks Like Nobel Economics Laureates Don't Like Bitcoin </a> </td> </tr> <tr> <td> 7 </td> <td> 7 </td> <td> <a href="http://www.lowrisc.org/blog/2017/11/seventh-risc-v-workshop-day-two/"> Seventh RISC-V Workshop: Day Two - LowRISC </a> </td> </tr> <tr> <td> 8 </td> <td> 6 </td> <td> <a href="http://www.spiegel.de/international/0,1518,433134,00.html"> China's Art Factories: Van Gogh from the Sweatshop (2006) </a> </td> </tr> <tr> <td> 9 </td> <td> 5 </td> <td> <a href="https://www.nytimes.com/2017/11/29/business/waymo-uber-trial.html"> Judge Tells Uber Lawyer: 'It Looks Like You 
Covered This Up' </a> </td> </tr> <tr> <td> 10 </td> <td> 0 </td> <td> <a href="http://plumshell.com/2017/11/30/as-a-solo-app-developer-i-decided-to-offer-phone-support-and-this-is-what-happened/"> As a solo developer, I decided to offer phone support </a> </td> </tr> <tr> <td> 11 </td> <td> 1 </td> <td> <a href="http://techcrunch.com/2016/10/27/why-did-protonmail-vanish-from-google-search-results-for-months/amp/"> Why Did ProtonMail Vanish from Google Search Results? </a> </td> </tr> <tr> <td> 12 </td> <td> 6 </td> <td> <a href="https://blog.pocketcluster.io/2017/11/30/weekly-machine-learning-opensource-roundup-nov-30-2017/"> Weekly Machine Learning Toolset and Library Roundup - Nov. 30, 2017 </a> </td> </tr> <tr> <td> 13 </td> <td> 8 </td> <td> <a href="https://www.space.com/38912-pulsar-discovery-by-jocelyn-bell.html"> 50 Years Ago Jocelyn Bell Discovered Pulsars </a> </td> </tr> <tr> <td> 14 </td> <td> 6 </td> <td> <a href="https://adventofcode.com/2017"> Advent of Code 2017 </a> </td> </tr> <tr> <td> 15 </td> <td> 9 </td> <td> <a href="item?id=15815913"> SafeButler (YC S17) is hiring employee #2 to modernize insurance </a> </td> </tr> <tr> <td> 16 </td> <td> 4 </td> <td> <a href="https://twitter.com/4Dgifts/status/936223487986946048"> BTC addresses whose private keys are from Sha256 of another public address </a> </td> </tr> <tr> <td> 17 </td> <td> 5 </td> <td> <a href="http://phasenoise.livejournal.com/2017/11/3185.html"> Review and Teardown of a Cheap GPS Jammer </a> </td> </tr> <tr> <td> 18 </td> <td> 4 </td> <td> <a href="https://blog.coinbase.com/coinbase-obtains-partial-victory-over-irs-dac041db59a3"> Coinbase Obtains Partial Victory Over IRS </a> </td> </tr> <tr> <td> 19 </td> <td> 2 </td> <td> <a href="http://www.danielwilczynski.com/2017/11/29/bitcoin-price-index/"> How to Profit from Bitcoin Bubble </a> </td> </tr> <tr> <td> 20 </td> <td> 7 </td> <td> <a href="https://fwupd.org/"> Linux Vendor Firmware Service </a> </td> </tr> <tr> <td> 21 </td> <td> 
8 </td> <td> <a href="http://www.writethedocs.org/guide/writing/mindshare/"> Building mindshare in a company </a> </td> </tr> <tr> <td> 22 </td> <td> 2 </td> <td> <a href="https://www.wired.com/story/dazzle-camouflage-san-diego-world-war-i/"> How Cubism Protected Warships in WorldWar I </a> </td> </tr> <tr> <td> 23 </td> <td> 3 </td> <td> <a href="http://publicdomainreview.org/collections/the-model-book-of-calligraphy-1561-1596/"> The Model Book of Calligraphy (1561-1596) </a> </td> </tr> <tr> <td> 24 </td> <td> 4 </td> <td> <a href="https://shkspr.mobi/blog/2017/11/how-do-you-move-out-of-a-smarthome/"> How do you move out of a smarthome? </a> </td> </tr> <tr> <td> 25 </td> <td> 7 </td> <td> <a href="http://www.animationmagazine.net/people/software-giant-autodesk-to-axe-13-of-global-workforce/"> Software Giant Autodesk to Axe 13% of Global Workforce </a> </td> </tr> <tr> <td> 26 </td> <td> 8 </td> <td> <a href="http://www.brendangregg.com/blog/2017-11-29/aws-ec2-virtualization-2017.html"> AWS EC2 Virtualization 2017: Including Nitro </a> </td> </tr> <tr> <td> 27 </td> <td> 6 </td> <td> <a href="http://www.bbc.co.uk/news/technology-42166089"> Google faces UK legal action for bypassing iPhone privacy settings to target ads </a> </td> </tr> <tr> <td> 28 </td> <td> 3 </td> <td> <a href="https://www.indexventures.com/optionplan"> OptionPlan, an app for founders looking to design a stock option plan </a> </td> </tr> <tr> <td> 29 </td> <td> 9 </td> <td> <a href="http://www.loper-os.org/?p=1927"> The Peculiarly Quiet Decline and Fall of the KVM </a> </td> </tr> <tr> <td> 30 </td> <td> 1 </td> <td> <a href="https://www.gamingonlinux.com/articles/nvidia-has-confirmed-a-driver-bug-resulting-in-a-loss-of-performance-on-linux.10804"> Nvidia has confirmed a driver bug resulting in a loss of performance on Linux </a> </td> </tr> </tbody> </table> </body> </html>
# Build one record per <tr> of the table body. Each row is read as three
# cells: ID, score, and a title cell that wraps the story link.
news = []
for tr in doc.tbody.find_all("tr"):
    tds = tr.find_all("td")
    # Trim surrounding whitespace so the stored values (and the files written
    # from them) hold only the cell content.
    ID = tds[0].text.strip()
    score = tds[1].text.strip()
    title = tds[2].text.strip()
    # Use a separate name for the story link so the module-level `url`
    # (the address of the scraped page) is not overwritten.
    link = tds[2].a['href']
    news.append({"ID": ID,
                 "score": int(score),
                 "title": title,
                 "url": link})
# Links of the stories whose score is strictly greater than 5, in the same
# order as they appear in `news`.
higher_5 = [
    entry['url']
    for entry in news
    if entry['score'] > 5
]
print(higher_5)
['https://github.com/laurent22/joplin/', 'https://arstechnica.com/information-technology/2017/11/australian-man-uses-snack-bags-as-faraday-cage-to-block-tracking-by-employer/', 'http://www.lowrisc.org/blog/2017/11/seventh-risc-v-workshop-day-two/', 'http://www.spiegel.de/international/0,1518,433134,00.html', 'https://blog.pocketcluster.io/2017/11/30/weekly-machine-learning-opensource-roundup-nov-30-2017/', 'https://www.space.com/38912-pulsar-discovery-by-jocelyn-bell.html', 'https://adventofcode.com/2017', 'item?id=15815913', 'https://fwupd.org/', 'http://www.writethedocs.org/guide/writing/mindshare/', 'http://www.animationmagazine.net/people/software-giant-autodesk-to-axe-13-of-global-workforce/', 'http://www.brendangregg.com/blog/2017-11-29/aws-ec2-virtualization-2017.html', 'http://www.bbc.co.uk/news/technology-42166089', 'http://www.loper-os.org/?p=1927']
import csv
# Write the collected records to news.csv: one header line, then one line per
# record with the columns in `header` order.
header = ['ID', 'score', 'title', 'url']
rows = [[item[column] for column in header] for item in news]
# The context manager closes the file even if a write raises. newline="" is
# what the csv module expects so it controls line endings itself; the explicit
# encoding keeps the output independent of the platform's default.
with open("news.csv", "w", newline="", encoding="utf-8") as handle:
    writer = csv.writer(handle)
    writer.writerow(header)
    writer.writerows(rows)
import json
# Serialize the list of records to news.json. The context manager guarantees
# the file is closed even if json.dump raises part-way through.
with open("news.json", "w") as handle:
    json.dump(news, handle)