Fixed product fetcher

2022-05-10 14:00:41 +02:00
parent 5e3e41af78
commit 41e4fef460
4 changed files with 75 additions and 20 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,5 @@ pymysql==1.0.2
 requests~=2.27.1
 beautifulsoup4~=4.11.1
 pandas~=1.4.1
+PyVirtualDisplay~=3.0
+selenium~=4.1.5
--- a/source/Dockerfile
+++ b/source/Dockerfile
@@ -3,6 +3,23 @@ FROM python:3.10-slim
 # Change the working directory to the root of the project
 WORKDIR /srv/flask_app
 # Install dependencies
 RUN apt update && apt install -y xvfb curl wget bzip2 libasound2 libc-bin libxtst6 packagekit-gtk3-module libx11-xcb-dev libdbus-glib-1-2 libxt6 libpci-dev && rm -rf /var/lib/apt/lists/*
 # Install geckodriver
 RUN GECKODRIVER_VERSION=`curl -L -s https://github.com/mozilla/geckodriver/releases/latest | grep -Po 'v[0-9]+.[0-9]+.[0-9]+' | head -1` && \
 wget https://github.com/mozilla/geckodriver/releases/download/$GECKODRIVER_VERSION/geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz && \
 tar -zxf geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz -C /usr/local/bin && \
 chmod +x /usr/local/bin/geckodriver && \
 rm geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz
 # Install firefox web browser
 RUN FIREFOX_SETUP=firefox-setup.tar.bz2 && \
    wget -O $FIREFOX_SETUP "https://download.mozilla.org/?product=firefox-latest&os=linux64" && \
    tar xjf $FIREFOX_SETUP -C /opt/ && \
    ln -s /opt/firefox/firefox /usr/bin/firefox && \
    rm $FIREFOX_SETUP
 # Install the dependencies
 COPY requirements.txt /srv/flask_app/
 RUN pip install -r requirements.txt --src /usr/local/src --no-warn-script-location
--- a/source/bot.py
+++ b/source/bot.py
@@ -497,7 +497,7 @@ def receive_product_data(message):
    product_src = fetch_url('https://www.amazon.de/dp/' + product_id)
    title = get_title(product_src)
-    image_url = get_image(product_src, get_title(product_src))
+    image_url = get_image(product_src)
    price = get_price(product_src)
    description = get_description(product_src)
--- a/source/fetcher.py
+++ b/source/fetcher.py
@@ -1,8 +1,12 @@
 """script with functions for fetching product data from amazon"""
 import json
 import os
 import time
 import requests
 from bs4 import BeautifulSoup
 from pyvirtualdisplay import Display
 from selenium import webdriver
 from selenium.webdriver import firefox
 def fetch_url(url):
@@ -17,12 +21,29 @@ def fetch_url(url):
    Raises:
        None: None
    """
-    headers = {
+    display = Display(visible=False, size=(800, 600))
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'
+    display.start()
-    }
-    response = requests.get(url, headers=headers)
+    firefox_options = firefox.options.Options()
-    return response.text
+    firefox_options.set_preference('browser.download.folderList', 2)
+    firefox_options.set_preference(
+        'browser.download.manager.showWhenStarting', False
+    )
+    firefox_options.set_preference('browser.download.dir', os.getcwd())
+    firefox_options.set_preference(
+        'browser.helperApps.neverAsk.saveToDisk', 'text/csv'
+    )
+    browser = webdriver.Firefox(options=firefox_options)
+    browser.get(url)
+    source = browser.page_source
+    browser.quit()
+    display.stop()
+    return source
 def get_title(response):
@@ -47,12 +68,11 @@ def get_title(response):
        return None
-def get_image(response, title):
+def get_image(response):
    """Get image urls from response
    Args:
        response (Text): html response from amazon
        title (String): title of product
    Returns:
        String: product image url
@@ -62,10 +82,10 @@ def get_image(response, title):
    """
    soup = BeautifulSoup(response, 'html.parser')
-    images = soup.find_all("img", alt=lambda a: a == title.replace("&", "&amp;"))
+    div = soup.find_all("div", attrs={"id": "imgTagWrapperId"})
-    if len(images) > 0:
+    if len(div) > 0:
-        images = json.loads(images[0]["data-a-dynamic-image"])
+        images = json.loads(div[0].img["data-a-dynamic-image"])
        # Find largest image
        largest_image_url = None
@@ -76,7 +96,7 @@ def get_image(response, title):
                largest_image_size = images[image][0]
        return largest_image_url
-    else:
+
    return None
@@ -137,9 +157,25 @@ def get_price(response):
 if __name__ == "__main__":
    """Main function"""
-    prod_src = fetch_url('https://www.amazon.de/dp/B082QDB6CG')
+    products = [
        'B082QDB6CG',
        'B07MBQPQ62',
        'B07MBQPQ62',
        'B09Y64QV33',
        'B00F0DGRZO',
        'B071J8CZP9',
        'B001MF002A',
        'B082QM712M',
        'B091DV8SXG',
    ]
-    print("Title:       " + get_title(prod_src) + "\n")
+    for p in products:
-    print("Image:       " + get_image(prod_src, get_title(prod_src)) + "\n")
+        prod_src = fetch_url('https://www.amazon.de/dp/' + p)
        print("-----------------------------------------------------" + p + "-----------------------------------------------------")
        print("Title:       " + str(get_title(prod_src)) + "\n")
        print("Image:       " + str(get_image(prod_src)) + "\n")
        print("Price:       " + str(get_price(prod_src)) + "\n")
-    print("Description: " + get_description(prod_src))
+        print("Description: " + str(get_description(prod_src)) + "\n\n")
        time.sleep(2)