Fixed product fetcher

This commit is contained in:
Administrator 2022-05-10 14:00:41 +02:00
parent 5e3e41af78
commit 41e4fef460
4 changed files with 75 additions and 20 deletions

View File

@ -6,3 +6,5 @@ pymysql==1.0.2
requests~=2.27.1
beautifulsoup4~=4.11.1
pandas~=1.4.1
PyVirtualDisplay~=3.0
selenium~=4.1.5

View File

@ -3,6 +3,23 @@ FROM python:3.10-slim
# Change the working directory to the root of the project
WORKDIR /srv/flask_app
# Install the system packages: Xvfb for the virtual display, the download and
# extraction tools (curl, wget, bzip2) and shared libraries that are presumably
# needed by the Firefox build installed further down.
# apt-get is used instead of apt because apt's command-line interface is not
# meant for non-interactive scripts. The package lists are removed afterwards
# to keep the layer small.
RUN apt-get update && apt-get install -y xvfb curl wget bzip2 libasound2 libc-bin libxtst6 packagekit-gtk3-module libx11-xcb-dev libdbus-glib-1-2 libxt6 libpci-dev && rm -rf /var/lib/apt/lists/*
# Install geckodriver, the WebDriver binary used to drive Firefox.
# The latest release tag is scraped from the GitHub "releases/latest" page.
# The dots in the pattern are escaped so only a literal vX.Y.Z tag matches,
# and the build stops early with a clear failure if no tag could be found.
RUN GECKODRIVER_VERSION="$(curl -L -s https://github.com/mozilla/geckodriver/releases/latest | grep -Po 'v[0-9]+\.[0-9]+\.[0-9]+' | head -1)" && \
    [ -n "$GECKODRIVER_VERSION" ] && \
    wget "https://github.com/mozilla/geckodriver/releases/download/$GECKODRIVER_VERSION/geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz" && \
    tar -zxf "geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz" -C /usr/local/bin && \
    chmod +x /usr/local/bin/geckodriver && \
    rm "geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz"
# Install the Firefox web browser from Mozilla's "latest" download redirect.
# tar is left to auto-detect the compression instead of forcing bzip2, because
# the archive format served by this URL is not fixed (newer Firefox releases
# are shipped as .tar.xz rather than .tar.bz2).
# NOTE(review): unpacking a .tar.xz archive needs the xz tool in the image;
# confirm it is available in the base image or add xz-utils to the packages.
RUN FIREFOX_SETUP=firefox-setup.tar && \
    wget -O "$FIREFOX_SETUP" "https://download.mozilla.org/?product=firefox-latest&os=linux64" && \
    tar -xf "$FIREFOX_SETUP" -C /opt/ && \
    ln -s /opt/firefox/firefox /usr/bin/firefox && \
    rm "$FIREFOX_SETUP"
# Install the Python packages listed in requirements.txt.
# Only requirements.txt is copied into the working directory for this step.
COPY requirements.txt /srv/flask_app/
RUN pip install -r requirements.txt --src /usr/local/src --no-warn-script-location

View File

@ -497,7 +497,7 @@ def receive_product_data(message):
product_src = fetch_url('https://www.amazon.de/dp/' + product_id)
title = get_title(product_src)
image_url = get_image(product_src, get_title(product_src))
image_url = get_image(product_src)
price = get_price(product_src)
description = get_description(product_src)

View File

@ -1,8 +1,12 @@
"""script with functions for fetching product data from amazon"""
import json
import os
import time
import requests
from bs4 import BeautifulSoup
from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.webdriver import firefox
def fetch_url(url):
@ -17,12 +21,29 @@ def fetch_url(url):
Raises:
None: None
"""
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'
}
display = Display(visible=False, size=(800, 600))
display.start()
response = requests.get(url, headers=headers)
return response.text
firefox_options = firefox.options.Options()
firefox_options.set_preference('browser.download.folderList', 2)
firefox_options.set_preference(
'browser.download.manager.showWhenStarting', False
)
firefox_options.set_preference('browser.download.dir', os.getcwd())
firefox_options.set_preference(
'browser.helperApps.neverAsk.saveToDisk', 'text/csv'
)
browser = webdriver.Firefox(options=firefox_options)
browser.get(url)
source = browser.page_source
browser.quit()
display.stop()
return source
def get_title(response):
@ -47,12 +68,11 @@ def get_title(response):
return None
def get_image(response, title):
def get_image(response):
"""Get image urls from response
Args:
response (Text): html response from amazon
title (String): title of product
Returns:
String: product image url
@ -62,10 +82,10 @@ def get_image(response, title):
"""
soup = BeautifulSoup(response, 'html.parser')
images = soup.find_all("img", alt=lambda a: a == title.replace("&", "&"))
div = soup.find_all("div", attrs={"id": "imgTagWrapperId"})
if len(images) > 0:
images = json.loads(images[0]["data-a-dynamic-image"])
if len(div) > 0:
images = json.loads(div[0].img["data-a-dynamic-image"])
# Find largest image
largest_image_url = None
@ -76,8 +96,8 @@ def get_image(response, title):
largest_image_size = images[image][0]
return largest_image_url
else:
return None
return None
def get_description(response):
@ -137,9 +157,25 @@ def get_price(response):
# Manual check of the fetcher: downloads Amazon product pages with fetch_url
# and prints the title, image, price and description parsed from each page.
if __name__ == "__main__":
"""Main function"""
prod_src = fetch_url('https://www.amazon.de/dp/B082QDB6CG')
# Product IDs; each one is appended to 'https://www.amazon.de/dp/' in the loop
# further down.
products = [
'B082QDB6CG',
# NOTE(review): 'B07MBQPQ62' is listed twice in a row, so that page is fetched
# twice - confirm whether the duplicate is intentional.
'B07MBQPQ62',
'B07MBQPQ62',
'B09Y64QV33',
'B00F0DGRZO',
'B071J8CZP9',
'B001MF002A',
'B082QM712M',
'B091DV8SXG',
]
# NOTE(review): the four prints below use the single prod_src fetched above and
# call get_image with two arguments, while the loop calls it with one. They are
# presumably the pre-change lines of this diff shown without -/+ markers -
# confirm before treating them as current code.
print("Title: " + get_title(prod_src) + "\n")
print("Image: " + get_image(prod_src, get_title(prod_src)) + "\n")
print("Price: " + str(get_price(prod_src)) + "\n")
print("Description: " + get_description(prod_src))
# Fetch and print every product; results are wrapped in str() before being
# concatenated into the output line.
for p in products:
prod_src = fetch_url('https://www.amazon.de/dp/' + p)
print("-----------------------------------------------------" + p + "-----------------------------------------------------")
print("Title: " + str(get_title(prod_src)) + "\n")
print("Image: " + str(get_image(prod_src)) + "\n")
print("Price: " + str(get_price(prod_src)) + "\n")
print("Description: " + str(get_description(prod_src)) + "\n\n")
# Pause two seconds between products, presumably to throttle the requests.
time.sleep(2)