#!/usr/bin/python3
# ==========================================================
# Given a web page (file)
# a. find the <title> tag contents if any
# b. find the first <a> tag's contents if any
# ==========================================================
from bs4 import BeautifulSoup
# ---- process an HTML file
# The document is parsed inside the `with` block, so the file is closed
# again before the tree is searched.
with open('zz3.html','r') as file:
    soup = BeautifulSoup(file,'html.parser')

# ---- find multiple HTML tags
# For each tag name, look up the first matching tag and show its inner
# markup twice: once as raw bytes and once decoded to str.
for tag_name in ['title','a']:
    tag_found = soup.find(tag_name)   # first match, or None when absent
    print()
    if tag_found is None:
        print(f'attribute <{tag_name}> not found')
        continue
    print(f'attribute <{tag_name}> found')
    # encode_contents() is the current name for the deprecated
    # renderContents(); by default it returns UTF-8 encoded bytes.
    btext = tag_found.encode_contents()
    print(f'type(btext) = {type(btext)}')
    print(btext)
    # Decode with the same encoding encode_contents() used.
    text = btext.decode('utf-8')
    print(f'type(text) - {type(text)}')
    print(text)
#!/usr/bin/python3
# ==========================================================
# Given a web page (file)
# a. find all of the <a> tag contents
# ==========================================================
from bs4 import BeautifulSoup
# ---- process an HTML file
# The document is parsed inside the `with` block, so the file is closed
# again before the tree is searched.
with open('zz3.html','r') as file:
    soup = BeautifulSoup(file,'html.parser')

# ---- find all of the anchor tags
anchors_found = soup.find_all('a')
if not anchors_found:
    print('no <a> tags found')
else:
    # Show each anchor's inner markup as raw bytes and as decoded str.
    for anchor in anchors_found:
        print()
        # encode_contents() is the current name for the deprecated
        # renderContents(); by default it returns UTF-8 encoded bytes.
        btext = anchor.encode_contents()
        print(f'type(btext) = {type(btext)}')
        print(btext)
        # Decode with the same encoding encode_contents() used.
        text = btext.decode('utf-8')
        print(f'type(text) - {type(text)}')
        print(text)
#!/usr/bin/python3
# ==========================================================
# Given a web page (file)
# a. find all of the <img> tag's source file names
# ==========================================================
from bs4 import BeautifulSoup
# ---- process an HTML file
# The document is parsed inside the `with` block, so the file is closed
# again before the tree is searched.
with open('zz2.html','r') as file:
    soup = BeautifulSoup(file,'html.parser')

# ---- find all of the image tags
imgs_found = soup.find_all('img')
if not imgs_found:
    print('no <img> tags found')
else:
    # ---- display all image source file names
    for img in imgs_found:
        # An <img> without a src attribute would raise KeyError with
        # img['src']; use .get() so one bad tag does not abort the listing.
        src = img.get('src')
        if src is None:
            print('<img> tag has no src attribute')
            continue
        print(src)
#!/usr/bin/python3
# ==========================================================
# Given a web page (url)
# find the <title> tag's contents if any
# ==========================================================
from bs4 import BeautifulSoup
import requests
# ---- download the page and print its <title> tag
url = "https://www.tutorialspoint.com/index.htm"
# requests has no default timeout; without one an unresponsive server
# would block this script indefinitely.
req = requests.get(url, timeout=10)
# Raise on 4xx/5xx so an error page is not parsed as the real document.
req.raise_for_status()
# Pass the raw bytes so BeautifulSoup works out the character encoding.
soup = BeautifulSoup(req.content, "html.parser")
# soup.title is the first <title> tag (printed with its markup), or None
# when the page has no title.
print(soup.title)