Files
birdscrape/birdscrape.py
2021-01-09 22:05:16 +00:00

56 lines
1.8 KiB
Python

import requests
import shutil
from bs4 import BeautifulSoup
def downloadImage(url, filename):
    """Download the image at ``url`` and write it to the local file ``filename``.

    The body is streamed to disk instead of being loaded into memory first.
    A status line is printed for both outcomes; nothing is returned.

    Args:
        url: Address of the image to fetch.
        filename: Path of the local file to write. It is opened in ``'wb'``
            mode, so an existing file of the same name is overwritten.
    """
    # stream=True defers reading the body so it can be copied in chunks.
    # Using the response as a context manager guarantees the underlying
    # connection is released on every path (success, non-200, or exception).
    with requests.get(url, stream=True) as r:
        # Only a plain 200 counts as success; anything else is reported below.
        if r.status_code == 200:
            # Ask the raw urllib3 stream to undo any Content-Encoding
            # (gzip/deflate) so the bytes written are the actual image data.
            r.raw.decode_content = True
            # Binary mode: image bytes must be written untouched.
            with open(filename, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
            print('Image sucessfully Downloaded: ', filename)
        else:
            print('Image Couldn\'t be retreived')
# Index page that lists every illustrated bird.
URL = "http://tirill.de/illustrierte-voegel-liste.html"
html_text = requests.get(URL)
soup = BeautifulSoup(html_text.text, 'html.parser')

# Dictionary for all bird URLs (a single list stored under the key "URL").
bird_dict_urls = {"URL": []}
birdlist = soup.find('ul', class_="level_1")
print("Getting list of all illustrated birds on: " + URL)

# find() returns None when the page has no matching <ul>; stop with a clear
# message instead of failing later with an AttributeError on None.
if birdlist is None:
    raise SystemExit("No <ul class=\"level_1\"> bird list found on: " + URL)

for bird in birdlist.find_all('li'):
    link = bird.a
    # Skip list items that carry no usable link (no <a> or no/empty href).
    if link is None or not link.get('href'):
        continue
    bird_url = "http://tirill.de/" + link['href']
    bird_dict_urls["URL"].append(bird_url)
print(str(len(bird_dict_urls["URL"])) + " Vochels gefunden.")

for i in bird_dict_urls["URL"]:
    # Single bird info + download: fetch the bird's page and save each image
    # found inside a <figure class="image_container">.
    html_bird = requests.get(i)
    bird_soup = BeautifulSoup(html_bird.text, 'html.parser')
    bird_images_container = bird_soup.find_all('figure', class_="image_container")
    for image in bird_images_container:
        img = image.img
        # A figure without an <img>, or an <img> without a src, has nothing
        # to download; an empty src would also produce an empty filename.
        if img is None or not img.get('src'):
            continue
        full_path_img = "https://tirill.de/" + img['src']
        # The local filename is the last path segment of the image URL.
        filename = full_path_img.split("/")[-1]
        print(full_path_img)
        print(filename)
        downloadImage(full_path_img, filename)