From bdd342f8ac2a428210e0935d0ff439223768d8b4 Mon Sep 17 00:00:00 2001
From: H4CK3R-01 <resiaknairolf@gmail.com>
Date: Tue, 3 May 2022 09:52:00 +0200
Subject: [PATCH] Added fetcher to fetch title, image url, price and
 description

---
 requirements.txt  |   4 +-
 source/fetcher.py | 145 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 147 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index dde3cfb..24d040a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,6 @@ pyTelegramBotAPI~=4.5.0
 python-dotenv~=0.20.0
 APScheduler~=3.9.1
 SQLAlchemy~=1.4.36
-mysqlclient~=1.4.6
\ No newline at end of file
+mysqlclient~=1.4.6
+requests~=2.27.1
+beautifulsoup4~=4.9.1
\ No newline at end of file
diff --git a/source/fetcher.py b/source/fetcher.py
index e3ba3d6..793623e 100644
--- a/source/fetcher.py
+++ b/source/fetcher.py
@@ -1 +1,144 @@
-"""script with functions for fetching product data from amazon"""
\ No newline at end of file
+"""script with functions for fetching product data from amazon"""
+import json
+
+import requests
+from bs4 import BeautifulSoup
+
+
+def fetch_url(url, timeout=10):
+    """Fetch a url and return the response body as text.
+
+    Args:
+        url (String): url to fetch
+        timeout (Number): seconds to wait for the server before giving up
+
+    Returns:
+        Text: html response from amazon
+
+    Raises:
+        requests.exceptions.RequestException: connection failed or timed out
+    """
+    # Browser-like user agent; requests has no default timeout, so set one
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'
+    }
+    return requests.get(url, headers=headers, timeout=timeout).text
+
+
+def get_title(response):
+    """Extract the product title from a fetched page.
+
+    The title is read from the first ``<meta name="title">`` tag; only the
+    text before the first colon of its ``content`` attribute is kept.
+
+    Args:
+        response (Text): html response from amazon
+
+    Returns:
+        String: title of product, or None if the page has no such meta tag
+
+    Raises:
+        KeyError: the meta tag has no ``content`` attribute
+    """
+    meta_tag = BeautifulSoup(response, 'html.parser').find("meta", attrs={'name': 'title'})
+    if meta_tag is None:
+        return None
+    head, _, _ = meta_tag["content"].partition(":")
+    return head.strip()
+
+
+def get_image(response, title):
+    """Get the url of the largest product image from response
+
+    Looks for the first ``img`` tag whose ``alt`` text equals the product title
+    and that carries a ``data-a-dynamic-image`` attribute. That attribute is
+    parsed as a JSON object mapping image urls to a list whose first element
+    is used as the image size.
+
+    Args:
+        response (Text): html response from amazon
+        title (String): title of product, may be None
+
+    Returns:
+        String: url of the largest product image, or None if none was found
+
+    Raises:
+        json.JSONDecodeError: the attribute does not contain valid JSON
+    """
+    # get_title() may return None; without a title there is nothing to match
+    if title is None:
+        return None
+
+    # NOTE(review): '&' is matched as '&amp;' in the alt text - kept from the
+    # original implementation, confirm against real pages
+    wanted_alt = title.replace("&", "&amp;")
+    soup = BeautifulSoup(response, 'html.parser')
+    image = soup.find("img", attrs={"alt": wanted_alt, "data-a-dynamic-image": True})
+    if image is None:
+        return None
+    sizes = json.loads(image["data-a-dynamic-image"])
+    return max(sizes, key=lambda url: sizes[url][0], default=None)
+
+
+def get_description(response):
+    """Get description from response
+
+    The description is built from the text of every ``span`` inside the first
+    ``div`` with id ``feature-bullets``, one span per line. Spans containing
+    one of the ``skip_markers`` below are left out.
+
+    Args:
+        response (Text): html response from amazon
+
+    Returns:
+        String: product description, or None if the page has no such div
+
+    Raises:
+        None: None
+    """
+    skip_markers = ("um sicherzustellen, dass dieser Artikel passt.", "›")
+
+    bullets = BeautifulSoup(response, 'html.parser').find("div", id="feature-bullets")
+    if bullets is None:
+        return None
+
+    # Join instead of repeated "+=" so no trailing newline has to be cut off
+    return "\n".join(span.text.strip()
+                     for span in bullets.find_all("span")
+                     if not any(marker in span.text for marker in skip_markers))
+
+
+def get_price(response):
+    """Extract the product price and its currency from a fetched page.
+
+    The first ``div`` with class ``twister-plus-buying-options-price-data``
+    is expected to hold JSON text; the first entry of the decoded value
+    provides ``priceAmount`` and ``currencySymbol``.
+
+    Args:
+        response (Text): html response from amazon
+
+    Returns:
+        Array: [price amount, currency symbol], or None if the div is
+            missing or its decoded JSON is empty
+
+    Raises:
+        json.JSONDecodeError: the div text is not valid JSON
+    """
+    document = BeautifulSoup(response, 'html.parser')
+    price_div = document.find("div", {"class": "twister-plus-buying-options-price-data"})
+    if price_div is None:
+        return None
+    entries = json.loads(price_div.text)
+    if len(entries) == 0:
+        return None
+    return [entries[0]["priceAmount"], entries[0]["currencySymbol"]]
+
+
+if __name__ == "__main__":
+    prod_src = fetch_url('https://www.amazon.de/dp/B082QDB6CG')
+    title = get_title(prod_src)  # may be None, hence the str() calls below
+    print("Title: " + str(title) + "\n")
+    print("Image: " + str(get_image(prod_src, title) if title is not None else None) + "\n")
+    print("Price: " + str(get_price(prod_src)) + "\n")
+    print("Description: " + str(get_description(prod_src)))