In [1]:
import requests, bs4
# Address of the page that holds the news table to scrape.
url = "http://oscar-lab.org/people/~zren/files/puzzle.html"
# Bound the wait so an unresponsive server cannot hang the kernel forever.
res = requests.get(url, timeout=10)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page as data.
res.raise_for_status()
In [2]:
# Decode the body with the encoding detected from its content rather than
# the one guessed from the HTTP headers.
res.encoding = res.apparent_encoding
# Name the parser explicitly: without it Beautiful Soup picks whichever
# parser happens to be installed (and emits GuessedAtParserWarning), so the
# resulting tree could differ from one machine to another.
doc = bs4.BeautifulSoup(res.text, "html.parser")
In [3]:
# Show the parsed tree, indented, to inspect the table layout before extracting rows.
print(doc.prettify())
<!DOCTYPE html>
<html>
 <body>
  <h1>
   Today's news (Adapted from
   <a href="https://news.ycombinator.com">
    Hacker's News
   </a>
   )
  </h1>
  <table border="0">
   <thead>
    <tr>
     <th>
      ID
     </th>
     <th>
      Score
     </th>
     <th>
      Title
     </th>
    </tr>
   </thead>
   <tbody>
    <tr>
     <td>
      1
     </td>
     <td>
      1
     </td>
     <td>
      <a href="https://docs.google.com/document/d/155yNpfR7dGKuN-4rbrvbJLcJkhGa_HqvVuyPK7UEfPo/edit">
       Reverse engineering YouTube demonetization algorithm
      </a>
     </td>
    </tr>
    <tr>
     <td>
      2
     </td>
     <td>
      9
     </td>
     <td>
      <a href="https://github.com/laurent22/joplin/">
       Joplin A note-taking and to-do app with builds for desktop, mobile, terminal
      </a>
     </td>
    </tr>
    <tr>
     <td>
      3
     </td>
     <td>
      4
     </td>
     <td>
      <a href="https://motherboard.vice.com/en_us/article/ywnmkk/coinbase-irs-14000-bitcoin-tax">
       Coinbase Ordered to Turn Over Identities of 14,355 Crypto Traders to the IRS
      </a>
     </td>
    </tr>
    <tr>
     <td>
      4
     </td>
     <td>
      8
     </td>
     <td>
      <a href="https://arstechnica.com/information-technology/2017/11/australian-man-uses-snack-bags-as-faraday-cage-to-block-tracking-by-employer/">
       Australian uses snack bags as Faraday cage to block tracking by employer
      </a>
     </td>
    </tr>
    <tr>
     <td>
      5
     </td>
     <td>
      3
     </td>
     <td>
      <a href="https://jontysinai.github.io">
       A blog I started on Neural Networks and Probability
      </a>
     </td>
    </tr>
    <tr>
     <td>
      6
     </td>
     <td>
      2
     </td>
     <td>
      <a href="https://www.bloomberg.com/news/articles/2017-11-30/so-it-looks-like-nobel-economics-laureates-don-t-like-bitcoin">
       It Looks Like Nobel Economics Laureates Don't Like Bitcoin
      </a>
     </td>
    </tr>
    <tr>
     <td>
      7
     </td>
     <td>
      7
     </td>
     <td>
      <a href="http://www.lowrisc.org/blog/2017/11/seventh-risc-v-workshop-day-two/">
       Seventh RISC-V Workshop: Day Two - LowRISC
      </a>
     </td>
    </tr>
    <tr>
     <td>
      8
     </td>
     <td>
      6
     </td>
     <td>
      <a href="http://www.spiegel.de/international/0,1518,433134,00.html">
       China's Art Factories: Van Gogh from the Sweatshop (2006)
      </a>
     </td>
    </tr>
    <tr>
     <td>
      9
     </td>
     <td>
      5
     </td>
     <td>
      <a href="https://www.nytimes.com/2017/11/29/business/waymo-uber-trial.html">
       Judge Tells Uber Lawyer: 'It Looks Like You Covered This Up'
      </a>
     </td>
    </tr>
    <tr>
     <td>
      10
     </td>
     <td>
      0
     </td>
     <td>
      <a href="http://plumshell.com/2017/11/30/as-a-solo-app-developer-i-decided-to-offer-phone-support-and-this-is-what-happened/">
       As a solo developer, I decided to offer phone support
      </a>
     </td>
    </tr>
    <tr>
     <td>
      11
     </td>
     <td>
      1
     </td>
     <td>
      <a href="http://techcrunch.com/2016/10/27/why-did-protonmail-vanish-from-google-search-results-for-months/amp/">
       Why Did ProtonMail Vanish from Google Search Results?
      </a>
     </td>
    </tr>
    <tr>
     <td>
      12
     </td>
     <td>
      6
     </td>
     <td>
      <a href="https://blog.pocketcluster.io/2017/11/30/weekly-machine-learning-opensource-roundup-nov-30-2017/">
       Weekly Machine Learning Toolset and Library Roundup - Nov. 30, 2017
      </a>
     </td>
    </tr>
    <tr>
     <td>
      13
     </td>
     <td>
      8
     </td>
     <td>
      <a href="https://www.space.com/38912-pulsar-discovery-by-jocelyn-bell.html">
       50 Years Ago Jocelyn Bell Discovered Pulsars
      </a>
     </td>
    </tr>
    <tr>
     <td>
      14
     </td>
     <td>
      6
     </td>
     <td>
      <a href="https://adventofcode.com/2017">
       Advent of Code 2017
      </a>
     </td>
    </tr>
    <tr>
     <td>
      15
     </td>
     <td>
      9
     </td>
     <td>
      <a href="item?id=15815913">
       SafeButler (YC S17) is hiring employee #2 to modernize insurance
      </a>
     </td>
    </tr>
    <tr>
     <td>
      16
     </td>
     <td>
      4
     </td>
     <td>
      <a href="https://twitter.com/4Dgifts/status/936223487986946048">
       BTC addresses whose private keys are from Sha256 of another public address
      </a>
     </td>
    </tr>
    <tr>
     <td>
      17
     </td>
     <td>
      5
     </td>
     <td>
      <a href="http://phasenoise.livejournal.com/2017/11/3185.html">
       Review and Teardown of a Cheap GPS Jammer
      </a>
     </td>
    </tr>
    <tr>
     <td>
      18
     </td>
     <td>
      4
     </td>
     <td>
      <a href="https://blog.coinbase.com/coinbase-obtains-partial-victory-over-irs-dac041db59a3">
       Coinbase Obtains Partial Victory Over IRS
      </a>
     </td>
    </tr>
    <tr>
     <td>
      19
     </td>
     <td>
      2
     </td>
     <td>
      <a href="http://www.danielwilczynski.com/2017/11/29/bitcoin-price-index/">
       How to Profit from Bitcoin Bubble
      </a>
     </td>
    </tr>
    <tr>
     <td>
      20
     </td>
     <td>
      7
     </td>
     <td>
      <a href="https://fwupd.org/">
       Linux Vendor Firmware Service
      </a>
     </td>
    </tr>
    <tr>
     <td>
      21
     </td>
     <td>
      8
     </td>
     <td>
      <a href="http://www.writethedocs.org/guide/writing/mindshare/">
       Building mindshare in a company
      </a>
     </td>
    </tr>
    <tr>
     <td>
      22
     </td>
     <td>
      2
     </td>
     <td>
      <a href="https://www.wired.com/story/dazzle-camouflage-san-diego-world-war-i/">
       How Cubism Protected Warships in WorldWar I
      </a>
     </td>
    </tr>
    <tr>
     <td>
      23
     </td>
     <td>
      3
     </td>
     <td>
      <a href="http://publicdomainreview.org/collections/the-model-book-of-calligraphy-1561-1596/">
       The Model Book of Calligraphy (1561-1596)
      </a>
     </td>
    </tr>
    <tr>
     <td>
      24
     </td>
     <td>
      4
     </td>
     <td>
      <a href="https://shkspr.mobi/blog/2017/11/how-do-you-move-out-of-a-smarthome/">
       How do you move out of a smarthome?
      </a>
     </td>
    </tr>
    <tr>
     <td>
      25
     </td>
     <td>
      7
     </td>
     <td>
      <a href="http://www.animationmagazine.net/people/software-giant-autodesk-to-axe-13-of-global-workforce/">
       Software Giant Autodesk to Axe 13% of Global Workforce
      </a>
     </td>
    </tr>
    <tr>
     <td>
      26
     </td>
     <td>
      8
     </td>
     <td>
      <a href="http://www.brendangregg.com/blog/2017-11-29/aws-ec2-virtualization-2017.html">
       AWS EC2 Virtualization 2017: Including Nitro
      </a>
     </td>
    </tr>
    <tr>
     <td>
      27
     </td>
     <td>
      6
     </td>
     <td>
      <a href="http://www.bbc.co.uk/news/technology-42166089">
       Google faces UK legal action for bypassing iPhone privacy settings to target ads
      </a>
     </td>
    </tr>
    <tr>
     <td>
      28
     </td>
     <td>
      3
     </td>
     <td>
      <a href="https://www.indexventures.com/optionplan">
       OptionPlan, an app for founders looking to design a stock option plan
      </a>
     </td>
    </tr>
    <tr>
     <td>
      29
     </td>
     <td>
      9
     </td>
     <td>
      <a href="http://www.loper-os.org/?p=1927">
       The Peculiarly Quiet Decline and Fall of the KVM
      </a>
     </td>
    </tr>
    <tr>
     <td>
      30
     </td>
     <td>
      1
     </td>
     <td>
      <a href="https://www.gamingonlinux.com/articles/nvidia-has-confirmed-a-driver-bug-resulting-in-a-loss-of-performance-on-linux.10804">
       Nvidia has confirmed a driver bug resulting in a loss of performance on Linux
      </a>
     </td>
    </tr>
   </tbody>
  </table>
 </body>
</html>

In [4]:
# Convert every row of the table body into a record with the keys
# "ID", "score", "title" and "url"; the score is stored as an int.
news = []
for tr in doc.tbody.find_all("tr"):
    tds = tr.find_all("td")
    # Cell text may carry whitespace/newlines from the markup around it;
    # strip it so the stored values are clean.
    ID = tds[0].text.strip()
    score = tds[1].text.strip()
    title = tds[2].text.strip()
    # Deliberately not named `url`: that global holds the address of the
    # scraped page and must not be overwritten by the last row's link.
    link = tds[2].a['href']
    news.append({"ID": ID,
                 "score": int(score),
                 "title": title,
                 "url": link})
In [5]:
# Collect the links of all stories whose score is strictly greater than 5,
# keeping the order in which they appear in `news`.
higher_5 = []
for entry in news:
    if entry['score'] > 5:
        higher_5.append(entry['url'])
print(higher_5)
['https://github.com/laurent22/joplin/', 'https://arstechnica.com/information-technology/2017/11/australian-man-uses-snack-bags-as-faraday-cage-to-block-tracking-by-employer/', 'http://www.lowrisc.org/blog/2017/11/seventh-risc-v-workshop-day-two/', 'http://www.spiegel.de/international/0,1518,433134,00.html', 'https://blog.pocketcluster.io/2017/11/30/weekly-machine-learning-opensource-roundup-nov-30-2017/', 'https://www.space.com/38912-pulsar-discovery-by-jocelyn-bell.html', 'https://adventofcode.com/2017', 'item?id=15815913', 'https://fwupd.org/', 'http://www.writethedocs.org/guide/writing/mindshare/', 'http://www.animationmagazine.net/people/software-giant-autodesk-to-axe-13-of-global-workforce/', 'http://www.brendangregg.com/blog/2017-11-29/aws-ec2-virtualization-2017.html', 'http://www.bbc.co.uk/news/technology-42166089', 'http://www.loper-os.org/?p=1927']
In [6]:
import csv
# Column order of the CSV file; each name is also the key read from a record.
header = ['ID', 'score', 'title', 'url']
# One list per record, with its values arranged in `header` order.
rows = [[item[key] for key in header] for item in news]
# Open, write and close in a single step: the `with` block closes the file
# even if writing fails, and nothing is left open between cells.
# newline="" lets the csv module control line endings itself; the explicit
# encoding keeps the output independent of the platform default.
with open("news.csv", "w", newline="", encoding="utf-8") as handle:
    writer = csv.writer(handle)
    writer.writerows([header] + rows)
In [8]:
import json
# Serialize the list of records to news.json. The `with` block closes the
# file even when json.dump raises; the explicit encoding keeps the result
# independent of the platform default.
with open("news.json", "w", encoding="utf-8") as handle:
    json.dump(news, handle)
In [ ]: