Web Scraping
Using Beautiful Soup & Python
To get the HTML of a page, use the Python Requests library:
# Fetch a page with the Requests library and inspect the response body.
import requests

response = requests.get('https://www.codecademy.com/articles/http-requests')
# .text decodes the body into a string
print(response.text)
# .content returns the raw bytes of the body instead
web_content = response.content
print(web_content)
## TUTORIAL
## TUTORIAL
# Parse a page into a BeautifulSoup tree and navigate it.
import requests
from bs4 import BeautifulSoup

# Download the raw HTML bytes of the lesson page.
webpage_response = requests.get('https://content.codecademy.com/courses/beautifulsoup/shellter.html')
webpage = webpage_response.content

# BeautifulSoup breaks the HTML document into a tree of Python objects
# in a readable, navigable form.
soup = BeautifulSoup(webpage, "html.parser")
print(soup)

# Attribute access returns the FIRST tag with that name (soup.div, soup.a, ...).
#   soup.p.name   -> the tag's name ("p")
#   soup.p.attrs  -> dict of the tag's attributes
#   soup.p.string -> the text inside the tag
print(soup.p)

# .children iterates a tag's direct sub-contents; .parent walks back up the tree.
for child in soup.ul.children:
    print(child)
# find_all() returns EVERY occurrence of a tag, not just the first.
print(soup.find_all("h1"))               # all <h1> tags
print(soup.find_all(["h1", "a", "p"]))   # a list matches any of several names

# Match on attributes instead of (or as well as) the tag name;
# this finds e.g. <div class="banner">Hello world</div>
soup.find_all(attrs={'class': 'banner'})
def has_banner_class_and_hello_world(tag):
    """Predicate for find_all: matches <div class="banner">Hello world</div>.

    Fix: a bs4 Tag has no .attr() method — the original `tag.attr('class')`
    raises TypeError. Use tag.get("class") instead. Beautiful Soup parses
    the multi-valued `class` attribute as a LIST of class names, so the
    comparison must be against ["banner"], not the string "banner".
    """
    return tag.get("class") == ["banner"] and tag.string == "Hello world"
# find_all() also accepts a function: every tag is passed to it and kept
# when the function returns True.
soup.find_all(has_banner_class_and_hello_world)

# CSS SELECTORS - select() takes a CSS selector string.
# All elements that have the class 'recipeLink':
soup.select(".recipeLink")

# Loop through all of the links to the funfetti recipes found by the search.
for link in soup.select(".recipeLink > a"):
    # FIX: requests.get() needs a URL string, not a Tag object —
    # pull the href attribute out of the <a> tag first.
    webpage = requests.get(link["href"])
    # FIX: parse the response BODY and name the parser explicitly,
    # matching how the soup above was built.
    new_soup = BeautifulSoup(webpage.content, "html.parser")

# To READ the visible text in the HTML (tags stripped):
soup.get_text()
#EXAMPLE -------------------------------------
# Scrape every turtle profile linked from the shellter index page and
# collect each turtle's stats into a pandas DataFrame.
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Links on the index page are relative, so each href is joined onto this prefix.
prefix = "https://content.codecademy.com/courses/beautifulsoup/"

webpage_response = requests.get('https://content.codecademy.com/courses/beautifulsoup/shellter.html')
webpage = webpage_response.content
soup = BeautifulSoup(webpage, "html.parser")

# Every <a> tag on the index page points at one turtle's profile.
turtle_links = soup.find_all("a")
links = []
# go through all of the a tags and get the links associated with them
# (FIX: loop bodies were unindented in the original — a SyntaxError)
for a in turtle_links:
    links.append(prefix + a["href"])

# Define turtle_data: maps each turtle's name to its list of stats.
turtle_data = {}
# follow each link:
for link in links:
    webpage = requests.get(link)
    turtle = BeautifulSoup(webpage.content, "html.parser")
    # The first element with class "name" holds the turtle's name.
    turtle_name = turtle.select(".name")[0].get_text()
    # The stats live in the page's <ul>; get_text("|") joins the <li>
    # texts with "|", and split("|") turns them back into one list entry
    # per stat.
    stats = turtle.find("ul")
    stats_text = stats.get_text("|")
    turtle_data[turtle_name] = stats_text.split("|")

# One DataFrame row per turtle (the dict keys become the row index).
turtle_df = pd.DataFrame.from_dict(turtle_data, orient='index')
print(turtle_df)
Last updated