BeautifulSoup Examples

#!/usr/bin/python3
# ==========================================================
# Given a web page (file)
#   a. find the <title> tag contents if any
#   b. find the first <a> tag's contents if any
# ==========================================================

from bs4 import BeautifulSoup

# ---- process a HTML file
# ---- (opened in binary mode so BeautifulSoup works out the
# ----  document encoding itself instead of relying on the
# ----  platform's default text encoding)

with open('zz3.html', 'rb') as file:
    soup = BeautifulSoup(file, 'html.parser')

# ---- find multiple HTML tags
# ---- (find() yields the first matching tag, or None)

for attr in ['title', 'a']:

    attr_found = soup.find(attr)

    print()

    if attr_found is None:
        print(f'attribute <{attr}> not found')
        continue

    print(f'attribute <{attr}> found')

    # ---- the tag's inner markup as bytes
    # ---- (encode_contents() is the current name for the
    # ----  deprecated renderContents(); it encodes as UTF-8
    # ----  by default)

    btext = attr_found.encode_contents()

    print(f'type(btext) = {type(btext)}')
    print(btext)

    # ---- the same contents as a str

    text = btext.decode('utf-8')

    print(f'type(text) - {type(text)}')
    print(text)

#!/usr/bin/python3
# ==========================================================
# Given a web page (file)
#   a. find all of the <a> tag contents
# ==========================================================

from bs4 import BeautifulSoup

# ---- process a HTML file
# ---- (opened in binary mode so BeautifulSoup works out the
# ----  document encoding itself instead of relying on the
# ----  platform's default text encoding)

with open('zz3.html', 'rb') as file:
    soup = BeautifulSoup(file, 'html.parser')

# ---- find all of the anchors tags
# ---- (find_all() gives an empty result when nothing matches)

attrs_found = soup.find_all('a')

if not attrs_found:
    print('no <a> tags found')
else:
    for attr in attrs_found:

        print()

        # ---- the tag's inner markup as bytes
        # ---- (encode_contents() is the current name for the
        # ----  deprecated renderContents(); it encodes as UTF-8
        # ----  by default)

        btext = attr.encode_contents()

        print(f'type(btext) = {type(btext)}')
        print(btext)

        # ---- the same contents as a str

        text = btext.decode('utf-8')

        print(f'type(text) - {type(text)}')
        print(text)

#!/usr/bin/python3
# ==========================================================
# Given a web page (file)
#   a. find all of the <img> tag's source file names
# ==========================================================

from bs4 import BeautifulSoup

# ---- process a HTML file
# ---- (opened in binary mode so BeautifulSoup works out the
# ----  document encoding itself instead of relying on the
# ----  platform's default text encoding)

with open('zz2.html', 'rb') as file:
    soup = BeautifulSoup(file, 'html.parser')

# ---- find all of the image tags

imgs_found = soup.find_all('img')

if not imgs_found:
    print('no <img> tags found')
else:

    # ---- display all image source file names

    for img in imgs_found:

        # ---- an <img> is not guaranteed to carry a src attribute;
        # ---- img['src'] would raise KeyError and stop the listing,
        # ---- so look the attribute up with .get() instead

        src = img.get('src')

        if src is None:
            print('<img> tag has no src attribute')
            continue

        print(src)

#!/usr/bin/python3
# ==========================================================
# Given a web page (url)
#   find the <title> tag's contents if any
# ==========================================================

from bs4 import BeautifulSoup
import requests

url = "https://www.tutorialspoint.com/index.htm"

# ---- download the page
# ---- (requests waits forever unless a timeout is given, and
# ----  raise_for_status() keeps an HTTP error page from being
# ----  parsed as if it were the requested document)

try:
    req = requests.get(url, timeout=10)
    req.raise_for_status()
except requests.RequestException as err:
    raise SystemExit(f'failed to fetch {url}: {err}') from err

soup = BeautifulSoup(req.content, "html.parser")

# ---- display the title's contents
# ---- (soup.title is None when the page has no <title> tag)

title = soup.title

if title is None:
    print('no <title> tag found')
else:
    print(title.get_text())