Web Scraping

Using Beautiful Soup & Python

To get the HTML of a page, use the Python Requests library:

import requests
 
webpage = requests.get('https://www.codecademy.com/articles/http-requests')
print(webpage.text)

web_content = webpage.content  # .content returns the raw bytes instead of the decoded text
print(web_content)
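
Before parsing, it can help to confirm the request actually succeeded; a minimal sketch using standard Requests features:

webpage = requests.get('https://www.codecademy.com/articles/http-requests')
webpage.raise_for_status()  # raises requests.HTTPError on a 4xx/5xx response
print(webpage.status_code)  # 200 on success
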
## TUTORIAL
import requests
from bs4 import BeautifulSoup
# BeautifulSoup breaks the HTML page into different types of objects

webpage_response = requests.get('https://content.codecademy.com/courses/beautifulsoup/shellter.html')

#get HTML content
webpage = webpage_response.content
# parse the HTML into a navigable tree of objects
soup = BeautifulSoup(webpage, "html.parser")

print(soup)
print(soup.p)  # .p returns the first <p> tag in the document (e.g. soup.div for the first <div>)
# soup.p.name - get name of tag
# soup.p.attrs - prints the dict representing attributes of the tag
# soup.p.string - get string inside tag
for child in soup.ul.children:  # iterates over the direct children of the first <ul> tag
    print(child) # similarly .parent will navigate up the tree
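
A quick self-contained sketch of these navigation attributes, run on a hypothetical inline snippet rather than the course page:

demo = BeautifulSoup('<div id="intro"><p class="text">Hi</p></div>', "html.parser")
print(demo.p.name)         # p
print(demo.p.attrs)        # {'class': ['text']}
print(demo.p.string)       # Hi
print(demo.p.parent.name)  # div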

print(soup.find_all("h1"))
print(soup.find_all(["h1", "a", "p"]))
# find_all() finds all occurrences of a tag; passing a list matches any of the listed tags
soup.find_all(attrs={'class':'banner'})
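
find_all() also accepts a compiled regular expression for the tag name; a short sketch matching every heading level:

import re
print(soup.find_all(re.compile("^h[1-6]$")))  # h1 through h6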

# to find <div class="banner">Hello world</div>
def has_banner_class_and_hello_world(tag):
    # class is a multi-valued attribute, so BeautifulSoup stores it as a list
    return tag.get('class') == ['banner'] and tag.string == 'Hello world'
 
soup.find_all(has_banner_class_and_hello_world)
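
To sanity-check the matcher, it can be run against a hypothetical inline snippet:

demo = BeautifulSoup('<div class="banner">Hello world</div>', "html.parser")
print(demo.find_all(has_banner_class_and_hello_world))  # [<div class="banner">Hello world</div>]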

# CSS SELECTORS - If we wanted to select all of the elements that have the class 'recipeLink'
soup.select(".recipeLink")
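
select() accepts any CSS selector, and select_one() returns only the first match; the selectors below are illustrative, not from the shellter page:

soup.select("div.banner")       # <div> tags with class "banner"
soup.select("#main a")          # <a> tags inside the element with id "main"
soup.select_one(".recipeLink")  # first match only (or None)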

#Let’s say we wanted to loop through all of the links to these funfetti recipes that we found from our search.
for link in soup.select(".recipeLink > a"):
  webpage = requests.get(link["href"])  # select() returns tags, so pull the URL out of href
  new_soup = BeautifulSoup(webpage.content, "html.parser")
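
If the hrefs are relative, they should be resolved against the page's base URL first; a hedged sketch (base_url is an assumption), with a short pause between requests out of courtesy to the server:

import time
from urllib.parse import urljoin

base_url = "https://content.codecademy.com/courses/beautifulsoup/"  # assumed base URL
for link in soup.select(".recipeLink > a"):
  full_url = urljoin(base_url, link["href"])  # resolves relative hrefs against the base
  webpage = requests.get(full_url)
  new_soup = BeautifulSoup(webpage.content, "html.parser")
  time.sleep(1)  # small delay between requests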


# To extract just the text from the HTML
soup.get_text()
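
get_text() also takes a separator and a strip flag, which helps when text from adjacent tags would otherwise run together:

print(soup.get_text(separator=" ", strip=True))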

#EXAMPLE -------------------------------------
import requests
from bs4 import BeautifulSoup
import pandas as pd

prefix = "https://content.codecademy.com/courses/beautifulsoup/"
webpage_response = requests.get('https://content.codecademy.com/courses/beautifulsoup/shellter.html')

webpage = webpage_response.content
soup = BeautifulSoup(webpage, "html.parser")

turtle_links = soup.find_all("a")
links = []
# go through all of the <a> tags and collect the links associated with them
for a in turtle_links:
  links.append(prefix+a["href"])
    
#Define turtle_data:
turtle_data = {}

#follow each link:
for link in links:
  webpage = requests.get(link)
  turtle = BeautifulSoup(webpage.content, "html.parser")
  turtle_name = turtle.select(".name")[0].get_text()
  
  stats = turtle.find("ul")
  stats_text = stats.get_text("|")
  turtle_data[turtle_name] = stats_text.split("|")

turtle_df = pd.DataFrame.from_dict(turtle_data, orient='index')
print(turtle_df)
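
The raw split tends to leave whitespace-only entries in each stats list; a small cleanup sketch before building the DataFrame:

clean_data = {name: [s.strip() for s in stats if s.strip()]
              for name, stats in turtle_data.items()}
turtle_df = pd.DataFrame.from_dict(clean_data, orient='index')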
