Fixed product fetcher

This commit is contained in:
Administrator 2022-05-10 14:00:41 +02:00
parent 5e3e41af78
commit 41e4fef460
4 changed files with 75 additions and 20 deletions

View File

@ -6,3 +6,5 @@ pymysql==1.0.2
requests~=2.27.1 requests~=2.27.1
beautifulsoup4~=4.11.1 beautifulsoup4~=4.11.1
pandas~=1.4.1 pandas~=1.4.1
PyVirtualDisplay~=3.0
selenium~=4.1.5

View File

@ -3,6 +3,23 @@ FROM python:3.10-slim
# Change the working directory to the root of the project # Change the working directory to the root of the project
WORKDIR /srv/flask_app WORKDIR /srv/flask_app
# Install system libraries needed by Xvfb and Firefox.
# apt-get is used instead of apt because apt's CLI is not meant for scripts.
RUN apt-get update && apt-get install -y xvfb curl wget bzip2 libasound2 libc-bin libxtst6 packagekit-gtk3-module libx11-xcb-dev libdbus-glib-1-2 libxt6 libpci-dev && rm -rf /var/lib/apt/lists/*
# Install geckodriver.
# The version tag is scraped from the "latest release" page; the dots in the
# pattern are escaped so only a literal vX.Y.Z tag matches, and the build
# stops early with a clear failure if no tag could be determined.
RUN GECKODRIVER_VERSION=$(curl -L -s https://github.com/mozilla/geckodriver/releases/latest | grep -Po 'v[0-9]+\.[0-9]+\.[0-9]+' | head -1) && \
    test -n "$GECKODRIVER_VERSION" && \
    wget "https://github.com/mozilla/geckodriver/releases/download/$GECKODRIVER_VERSION/geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz" && \
    tar -zxf "geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz" -C /usr/local/bin && \
    chmod +x /usr/local/bin/geckodriver && \
    rm "geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz"
# Install firefox web browser (unpacked to /opt/firefox and linked onto PATH)
RUN FIREFOX_SETUP=firefox-setup.tar.bz2 && \
    wget -O $FIREFOX_SETUP "https://download.mozilla.org/?product=firefox-latest&os=linux64" && \
    tar xjf $FIREFOX_SETUP -C /opt/ && \
    ln -s /opt/firefox/firefox /usr/bin/firefox && \
    rm $FIREFOX_SETUP
# Install the dependencies # Install the dependencies
COPY requirements.txt /srv/flask_app/ COPY requirements.txt /srv/flask_app/
RUN pip install -r requirements.txt --src /usr/local/src --no-warn-script-location RUN pip install -r requirements.txt --src /usr/local/src --no-warn-script-location

View File

@ -497,7 +497,7 @@ def receive_product_data(message):
product_src = fetch_url('https://www.amazon.de/dp/' + product_id) product_src = fetch_url('https://www.amazon.de/dp/' + product_id)
title = get_title(product_src) title = get_title(product_src)
image_url = get_image(product_src, get_title(product_src)) image_url = get_image(product_src)
price = get_price(product_src) price = get_price(product_src)
description = get_description(product_src) description = get_description(product_src)

View File

@ -1,8 +1,12 @@
"""script with functions for fetching product data from amazon""" """script with functions for fetching product data from amazon"""
import json import json
import os
import time
import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.webdriver import firefox
def fetch_url(url):
    """Load a page in headless Firefox and return its source

    A virtual display is started, Firefox is driven through Selenium to
    open the url, and the page source reported by the browser is returned.
    The browser and the virtual display are always shut down again, even
    when starting Firefox or loading the page fails.

    Args:
        url (String): address of the page to load

    Returns:
        Text: page source as reported by the browser

    Raises:
        Errors raised by pyvirtualdisplay or selenium are not caught here
        and propagate to the caller after cleanup.
    """
    display = Display(visible=False, size=(800, 600))
    display.start()
    try:
        # Download-related Firefox preferences: target the current working
        # directory and do not prompt for text/csv files.
        firefox_options = firefox.options.Options()
        firefox_options.set_preference('browser.download.folderList', 2)
        firefox_options.set_preference(
            'browser.download.manager.showWhenStarting', False
        )
        firefox_options.set_preference('browser.download.dir', os.getcwd())
        firefox_options.set_preference(
            'browser.helperApps.neverAsk.saveToDisk', 'text/csv'
        )

        browser = webdriver.Firefox(options=firefox_options)
        try:
            browser.get(url)
            return browser.page_source
        finally:
            # Always end the browser session so no Firefox/geckodriver
            # process is left behind when the page load raises.
            browser.quit()
    finally:
        # Stop the virtual display even if Firefox could not be started.
        display.stop()
def get_title(response): def get_title(response):
@ -47,12 +68,11 @@ def get_title(response):
return None return None
def get_image(response, title): def get_image(response):
"""Get image urls from response """Get image urls from response
Args: Args:
response (Text): html response from amazon response (Text): html response from amazon
title (String): title of product
Returns: Returns:
String: product image url String: product image url
@ -62,10 +82,10 @@ def get_image(response, title):
""" """
soup = BeautifulSoup(response, 'html.parser') soup = BeautifulSoup(response, 'html.parser')
images = soup.find_all("img", alt=lambda a: a == title.replace("&", "&")) div = soup.find_all("div", attrs={"id": "imgTagWrapperId"})
if len(images) > 0: if len(div) > 0:
images = json.loads(images[0]["data-a-dynamic-image"]) images = json.loads(div[0].img["data-a-dynamic-image"])
# Find largest image # Find largest image
largest_image_url = None largest_image_url = None
@ -76,7 +96,7 @@ def get_image(response, title):
largest_image_size = images[image][0] largest_image_size = images[image][0]
return largest_image_url return largest_image_url
else:
return None return None
@ -137,9 +157,25 @@ def get_price(response):
if __name__ == "__main__":
    # Manual smoke run: fetch a fixed list of product pages and print what
    # each extractor returns for them.
    product_ids = (
        'B082QDB6CG',
        'B07MBQPQ62',
        'B07MBQPQ62',
        'B09Y64QV33',
        'B00F0DGRZO',
        'B071J8CZP9',
        'B001MF002A',
        'B082QM712M',
        'B091DV8SXG',
    )
    rule = "-----------------------------------------------------"

    for product_id in product_ids:
        page = fetch_url(f"https://www.amazon.de/dp/{product_id}")

        print(f"{rule}{product_id}{rule}")
        print(f"Title: {get_title(page)}\n")
        print(f"Image: {get_image(page)}\n")
        print(f"Price: {get_price(page)}\n")
        print(f"Description: {get_description(page)}\n\n")

        # Pause between products before requesting the next page.
        time.sleep(2)