# birdscrape.py
# Extracted from git patch d43096f34659ba74c8a4c16ecf2792ad0ccdf192
# ("Initial commit", jonas, Sat, 9 Jan 2021 22:05:16 +0000).
"""Download the bird illustrations linked from the tirill.de bird list.

The script fetches the listing page at ``URL``, follows the link found in
every ``<li>`` of the ``<ul class="level_1">`` element, and saves each image
found inside a ``<figure class="image_container">`` on those pages into the
current working directory.
"""

import shutil
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup

URL = "http://tirill.de/illustrierte-voegel-liste.html"

# Bases that relative links are resolved against.  Bird pages were fetched
# over http and images over https in the original script; that is kept.
PAGE_BASE_URL = "http://tirill.de/"
IMAGE_BASE_URL = "https://tirill.de/"

# Seconds to wait for the server before giving up on a request.  Without a
# timeout a stalled connection would block the script indefinitely.
REQUEST_TIMEOUT = 30


def downloadImage(url, filename, timeout=REQUEST_TIMEOUT):
    """Stream the image at *url* into the local file *filename*.

    Args:
        url: Address of the image to fetch.
        filename: Path of the file to write (opened in binary write mode).
        timeout: Seconds to wait for the server; passed to ``requests.get``.

    Returns:
        None. A status line is printed for both success and failure.

    Raises:
        requests.RequestException: If the request itself fails (connection
            error, timeout, ...).
        OSError: If *filename* cannot be opened or written.
    """
    # The context manager releases the connection even if writing fails;
    # with stream=True it would otherwise stay open until garbage collected.
    with requests.get(url, stream=True, timeout=timeout) as r:
        if r.status_code != 200:
            print('Image Couldn\'t be retreived')
            return

        # Let the raw stream decode any content encoding; the original
        # author noted the saved file could otherwise end up empty.
        r.raw.decode_content = True

        with open(filename, 'wb') as f:
            shutil.copyfileobj(r.raw, f)

    print('Image sucessfully Downloaded: ', filename)


def _filename_from_url(url):
    """Return the last path segment of *url* (query/fragment excluded)."""
    return urlsplit(url).path.rsplit("/", 1)[-1]


def _collect_bird_urls(birdlist):
    """Return the absolute URL of the first link in every ``<li>`` of *birdlist*."""
    bird_urls = []
    for bird in birdlist.find_all('li'):
        link = bird.a
        href = link.get('href') if link is not None else None
        if not href:
            # List items without a usable link cannot be followed.
            continue
        # urljoin equals plain concatenation for relative paths but also
        # copes with absolute or root-relative hrefs.
        bird_urls.append(urljoin(PAGE_BASE_URL, href))
    return bird_urls


def _collect_image_urls(bird_soup):
    """Return the absolute URL of the image in every ``image_container`` figure."""
    image_urls = []
    for figure in bird_soup.find_all('figure', class_="image_container"):
        img = figure.img
        src = img.get('src') if img is not None else None
        if not src:
            # Figures without an <img src=...> have nothing to download.
            continue
        image_urls.append(urljoin(IMAGE_BASE_URL, src))
    return image_urls


def main():
    """Scrape the listing page and download every illustration it leads to."""
    listing = requests.get(URL, timeout=REQUEST_TIMEOUT)
    # Fail with a clear HTTP error instead of parsing an error page.
    listing.raise_for_status()
    soup = BeautifulSoup(listing.text, 'html.parser')

    birdlist = soup.find('ul', class_="level_1")
    if birdlist is None:
        raise SystemExit('No <ul class="level_1"> found on: ' + URL)

    print("Getting list of all illustrated birds on: " + URL)

    bird_urls = _collect_bird_urls(birdlist)

    print(str(len(bird_urls)) + " Vochels gefunden.")

    for bird_url in bird_urls:
        html_bird = requests.get(bird_url, timeout=REQUEST_TIMEOUT)
        bird_soup = BeautifulSoup(html_bird.text, 'html.parser')

        for full_path_img in _collect_image_urls(bird_soup):
            filename = _filename_from_url(full_path_img)
            if not filename:
                # A URL ending in "/" yields no file name to write to.
                continue
            print(full_path_img)
            print(filename)
            downloadImage(full_path_img, filename)


if __name__ == "__main__":
    main()