From bdd342f8ac2a428210e0935d0ff439223768d8b4 Mon Sep 17 00:00:00 2001
From: H4CK3R-01
Date: Tue, 3 May 2022 09:52:00 +0200
Subject: [PATCH] Added fetcher to fetch title, image url, price and description

---
 requirements.txt  |   4 +-
 source/fetcher.py | 156 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 158 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index dde3cfb..24d040a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,6 @@ pyTelegramBotAPI~=4.5.0
 python-dotenv~=0.20.0
 APScheduler~=3.9.1
 SQLAlchemy~=1.4.36
-mysqlclient~=1.4.6
\ No newline at end of file
+mysqlclient~=1.4.6
+requests~=2.27.1
+beautifulsoup4~=4.9.1
\ No newline at end of file
diff --git a/source/fetcher.py b/source/fetcher.py
index e3ba3d6..793623e 100644
--- a/source/fetcher.py
+++ b/source/fetcher.py
@@ -1 +1,155 @@
-"""script with functions for fetching product data from amazon"""
\ No newline at end of file
+"""script with functions for fetching product data from amazon"""
+import html
+import json
+
+import requests
+from bs4 import BeautifulSoup
+
+
+def fetch_url(url, timeout=10):
+    """fetch url and return response
+
+    Args:
+        url (String): url to fetch
+        timeout (Number): seconds to wait for amazon before giving up
+
+    Returns:
+        Text: html response from amazon
+
+    Raises:
+        requests.exceptions.RequestException: request failed or timed out
+    """
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'
+    }
+
+    # Without a timeout a stalled connection would block forever
+    response = requests.get(url, headers=headers, timeout=timeout)
+    return response.text
+
+
+def get_title(response):
+    """Get title from response
+
+    Args:
+        response (Text): html response from amazon
+
+    Returns:
+        String: title of product, None if no title was found
+
+    Raises:
+        None: None
+    """
+    soup = BeautifulSoup(response, 'html.parser')
+
+    title = soup.find("meta", attrs={'name': 'title'})
+
+    # A meta tag without content attribute has no title to offer
+    if title is None or title.get("content") is None:
+        return None
+
+    # Only the text in front of the first colon is kept
+    return title["content"].split(":")[0].strip()
+
+
+def get_image(response, title):
+    """Get image urls from response
+
+    Args:
+        response (Text): html response from amazon
+        title (String): title of product
+
+    Returns:
+        String: url of largest product image, None if no image was found
+
+    Raises:
+        ValueError: image data of the page is not valid json
+    """
+    # Without a title there is no alt text to look for
+    if title is None:
+        return None
+
+    soup = BeautifulSoup(response, 'html.parser')
+
+    # Accept the title as given and with its html entities decoded
+    alt_texts = (title, html.unescape(title))
+    image = soup.find("img", alt=lambda a: a in alt_texts,
+                      attrs={"data-a-dynamic-image": True})
+
+    if image is None:
+        return None
+
+    # Maps image url to a list, its first entry is used as size of the image
+    images = json.loads(image["data-a-dynamic-image"])
+
+    # max returns the first of equally large images, None if there are none
+    return max(images, key=lambda url: images[url][0], default=None)
+
+
+def get_description(response):
+    """Get description from response
+
+    Args:
+        response (Text): html response from amazon
+
+    Returns:
+        String: product description, None if no description was found
+
+    Raises:
+        None: None
+    """
+    soup = BeautifulSoup(response, 'html.parser')
+
+    description = soup.find("div", id="feature-bullets")
+
+    if description is None:
+        return None
+
+    # Leave out the hint about the fit of the item and spans with a "›"
+    lines = [
+        item.text.strip()
+        for item in description.find_all("span")
+        if "um sicherzustellen, dass dieser Artikel passt." not in item.text
+        and "›" not in item.text
+    ]
+
+    return "\n".join(lines)
+
+
+def get_price(response):
+    """Get price from response
+
+    Args:
+        response (Text): html response from amazon
+
+    Returns:
+        Array: product price and currency, None if no price was found
+
+    Raises:
+        ValueError: price data of the page is not valid json
+    """
+    soup = BeautifulSoup(response, 'html.parser')
+
+    price = soup.find("div", {"class": "twister-plus-buying-options-price-data"})
+
+    if price is None:
+        return None
+
+    # The first entry of the decoded json is used as price of the product
+    j = json.loads(price.text)
+
+    if len(j) > 0:
+        return [j[0]["priceAmount"], j[0]["currencySymbol"]]
+
+    return None
+
+
+if __name__ == "__main__":
+    prod_src = fetch_url('https://www.amazon.de/dp/B082QDB6CG')
+    prod_title = get_title(prod_src)
+
+    # str() keeps the output working when a value could not be found (None)
+    print("Title: " + str(prod_title) + "\n")
+    print("Image: " + str(get_image(prod_src, prod_title)) + "\n")
+    print("Price: " + str(get_price(prod_src)) + "\n")
+    print("Description: " + str(get_description(prod_src)))
\ No newline at end of file