Creative Web Automation: Generate Ad-Free Study Notes

How to use an automated script to extract only the text content from online study notes

Automation Design

Implementation

# Find every anchor inside the "summary" section of the landing page whose
# class is 'landing-page__umbrella__link' and whose text starts with "Act".
links = driver.find_elements(:xpath, "//div[@data-jump='summary']//li/a[@class='landing-page__umbrella__link' and starts-with(text(), 'Act')]")
# Keep only the href of each anchor; these are the section pages to visit.
section_links = links.map { |link| link["href"] }
# Grab the HTML of the page currently loaded in the browser.
the_html = driver.page_source
puts the_html.size # verify HTML was obtained
# File.write opens, writes and closes in one call, so the content is flushed
# to disk immediately instead of waiting for an unclosed handle to be collected.
File.write("tmp.html", the_html)
# Parse one saved section page with Nokogiri and rebuild a minimal HTML
# document that contains only the text of its headings and paragraphs.
require 'nokogiri'
the_html = File.read("/tmp/section-0.html")
doc = Nokogiri::HTML(the_html)
the_pure_content_html = "<html><body>"
elem_main_content = doc.xpath("//div[contains(@class, 'mainTextContent')]")
# Walk the direct children of the main content container and keep only the
# <h3> and <p> elements; every other node is dropped.
elem_main_content.children.each do |node|
  case node.name
  when "h3"
    # node.text is the element's text content (what String#+ obtained
    # implicitly from the node before).
    the_pure_content_html += "\n<br/><h3>#{node.text}</h3>\n"
  when "p"
    the_pure_content_html += "\n<p>#{node.text}</p>\n"
  end
end
the_pure_content_html += "\n</body></html>"
Additional “Read More” link
# revised filter: same h3/p selection as before, but paragraphs whose markup
# opens with a link or a span are left out.
case x.name
when "h3"
  the_pure_content_html += "\n<br/><h3>#{x.text}</h3>\n"
when "p"
  markup = x.to_s
  # skip additional links
  unless markup.include?("<p><a href") || markup.include?("<p><span")
    the_pure_content_html += "\n<p>#{x.text}</p>\n"
  end
end
# Outline of the multi-section version: parse each of the eight saved section
# files in turn and append their filtered content to a single document.
the_pure_content_html = "<html><body>"
8.times do |idx|
  doc = Nokogiri::HTML(File.read("/tmp/section-#{idx}.html"))
  # ...
  # the_pure_content_html += <filtered h3/p text of this section> (see above)
end
the_pure_content_html += "\n</body></html>"
# File.write closes the file after writing, so the output is fully flushed.
File.write("/tmp/clean-vers.html", the_pure_content_html)
# Opening of the generated document: an inline stylesheet that sets the body
# font and background and colours the <h3> headings, followed by <body>.
the_pure_content_html = <<~HTML
  <html><head>
  <style>
  body {background-color: #FFF;
  font-family: Verdana, Helvetica, Arial;
  font-size: 14px; }
  h3 {font-size: 15px; color: blue;}
  </style>
  </head><body>
HTML

Complete Script

# Opens the Macbeth landing page, collects the link of every "Act ..." entry
# in the summary section, then visits each link and saves the page HTML to
# /tmp/section-<index>.html.
it "Download Macbeth Sparknotes" do
  driver.get("https://www.sparknotes.com/shakespeare/macbeth")
  # Main Page, get all section links
  links = driver.find_elements(:xpath, "//div[@data-jump='summary']//li/a[@class='landing-page__umbrella__link' and starts-with(text(), 'Act')]")
  section_links = links.map { |link| link["href"] }

  section_links.each_with_index do |current_section, idx|
    driver.get(current_section)
    the_html = driver.page_source # get page html
    # File.write closes the file on return; the previous File.open(...).write
    # left one unclosed (and possibly unflushed) handle per section.
    File.write("/tmp/section-#{idx}.html", the_html)
  end
end
# Builds a single ad-free HTML file from the saved section pages: for each
# section, keep only the text of <h3> and <p> children of the main content
# container, skipping paragraphs that open with a link or a span.
require 'nokogiri'
the_pure_content_html = "<html><head>
<style>
body {background-color: #FFF;
font-family: Verdana, Helvetica, Arial;
font-size: 14px;}
h3 {font-size: 15px; color: blue;}
</style>
</head><body>\n"
# Reads /tmp/section-0.html through /tmp/section-7.html.
8.times do |idx|
  the_html = File.read("/tmp/section-#{idx}.html")
  doc = Nokogiri::HTML(the_html)
  elem_main_content = doc.xpath("//div[contains(@class, 'mainTextContent')]")
  # Plain `each`: the element index was never used, and naming it `idx`
  # shadowed the section index of the enclosing loop.
  elem_main_content.children.each do |node|
    case node.name
    when "h3"
      the_pure_content_html += "\n<br/><h3>#{node.text}</h3>\n"
    when "p"
      markup = node.to_s
      # skip paragraphs that start with a link or a span
      next if markup.include?("<p><a href") || markup.include?("<p><span")

      the_pure_content_html += "\n<p>#{node.text}</p>\n"
    end
  end
end
the_pure_content_html += "\n</body></html>"
# File.write closes the file after writing, so the result is flushed to disk
# before the script exits.
File.write("/tmp/clean-ver.html", the_pure_content_html)

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store