Fixed product fetcher
This commit is contained in:
parent
5e3e41af78
commit
41e4fef460
@ -6,3 +6,5 @@ pymysql==1.0.2
|
|||||||
requests~=2.27.1
|
requests~=2.27.1
|
||||||
beautifulsoup4~=4.11.1
|
beautifulsoup4~=4.11.1
|
||||||
pandas~=1.4.1
|
pandas~=1.4.1
|
||||||
|
PyVirtualDisplay~=3.0
|
||||||
|
selenium~=4.1.5
|
@ -3,6 +3,23 @@ FROM python:3.10-slim
|
|||||||
# Change the working directory to the root of the project
|
# Change the working directory to the root of the project
|
||||||
WORKDIR /srv/flask_app
|
WORKDIR /srv/flask_app
|
||||||
|
|
||||||
|
# Install system packages via apt (Xvfb, download/extract tools, shared libraries)
|
||||||
|
RUN apt update && apt install -y xvfb curl wget bzip2 libasound2 libc-bin libxtst6 packagekit-gtk3-module libx11-xcb-dev libdbus-glib-1-2 libxt6 libpci-dev && rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Install geckodriver
|
||||||
|
RUN GECKODRIVER_VERSION=`curl -L -s https://github.com/mozilla/geckodriver/releases/latest | grep -Po 'v[0-9]+.[0-9]+.[0-9]+' | head -1` && \
|
||||||
|
wget https://github.com/mozilla/geckodriver/releases/download/$GECKODRIVER_VERSION/geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz && \
|
||||||
|
tar -zxf geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz -C /usr/local/bin && \
|
||||||
|
chmod +x /usr/local/bin/geckodriver && \
|
||||||
|
rm geckodriver-$GECKODRIVER_VERSION-linux64.tar.gz
|
||||||
|
|
||||||
|
# Install firefox web browser
|
||||||
|
RUN FIREFOX_SETUP=firefox-setup.tar.bz2 && \
|
||||||
|
wget -O $FIREFOX_SETUP "https://download.mozilla.org/?product=firefox-latest&os=linux64" && \
|
||||||
|
tar xjf $FIREFOX_SETUP -C /opt/ && \
|
||||||
|
ln -s /opt/firefox/firefox /usr/bin/firefox && \
|
||||||
|
rm $FIREFOX_SETUP
|
||||||
|
|
||||||
# Install the dependencies
|
# Install the dependencies
|
||||||
COPY requirements.txt /srv/flask_app/
|
COPY requirements.txt /srv/flask_app/
|
||||||
RUN pip install -r requirements.txt --src /usr/local/src --no-warn-script-location
|
RUN pip install -r requirements.txt --src /usr/local/src --no-warn-script-location
|
||||||
|
@ -497,7 +497,7 @@ def receive_product_data(message):
|
|||||||
product_src = fetch_url('https://www.amazon.de/dp/' + product_id)
|
product_src = fetch_url('https://www.amazon.de/dp/' + product_id)
|
||||||
|
|
||||||
title = get_title(product_src)
|
title = get_title(product_src)
|
||||||
image_url = get_image(product_src, get_title(product_src))
|
image_url = get_image(product_src)
|
||||||
price = get_price(product_src)
|
price = get_price(product_src)
|
||||||
description = get_description(product_src)
|
description = get_description(product_src)
|
||||||
|
|
||||||
|
@ -1,8 +1,12 @@
|
|||||||
"""script with functions for fetching product data from amazon"""
|
"""script with functions for fetching product data from amazon"""
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
|
||||||
import requests
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
from pyvirtualdisplay import Display
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver import firefox
|
||||||
|
|
||||||
|
|
||||||
def fetch_url(url):
|
def fetch_url(url):
|
||||||
@ -17,12 +21,29 @@ def fetch_url(url):
|
|||||||
Raises:
|
Raises:
|
||||||
None: None
|
None: None
|
||||||
"""
|
"""
|
||||||
headers = {
|
display = Display(visible=False, size=(800, 600))
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'
|
display.start()
|
||||||
}
|
|
||||||
|
|
||||||
response = requests.get(url, headers=headers)
|
firefox_options = firefox.options.Options()
|
||||||
return response.text
|
firefox_options.set_preference('browser.download.folderList', 2)
|
||||||
|
firefox_options.set_preference(
|
||||||
|
'browser.download.manager.showWhenStarting', False
|
||||||
|
)
|
||||||
|
firefox_options.set_preference('browser.download.dir', os.getcwd())
|
||||||
|
firefox_options.set_preference(
|
||||||
|
'browser.helperApps.neverAsk.saveToDisk', 'text/csv'
|
||||||
|
)
|
||||||
|
|
||||||
|
browser = webdriver.Firefox(options=firefox_options)
|
||||||
|
|
||||||
|
browser.get(url)
|
||||||
|
|
||||||
|
source = browser.page_source
|
||||||
|
|
||||||
|
browser.quit()
|
||||||
|
display.stop()
|
||||||
|
|
||||||
|
return source
|
||||||
|
|
||||||
|
|
||||||
def get_title(response):
|
def get_title(response):
|
||||||
@ -47,12 +68,11 @@ def get_title(response):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def get_image(response, title):
|
def get_image(response):
|
||||||
"""Get image urls from response
|
"""Get image urls from response
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
response (Text): html response from amazon
|
response (Text): html response from amazon
|
||||||
title (String): title of product
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
String: product image url
|
String: product image url
|
||||||
@ -62,10 +82,10 @@ def get_image(response, title):
|
|||||||
"""
|
"""
|
||||||
soup = BeautifulSoup(response, 'html.parser')
|
soup = BeautifulSoup(response, 'html.parser')
|
||||||
|
|
||||||
images = soup.find_all("img", alt=lambda a: a == title.replace("&amp;", "&"))
|
div = soup.find_all("div", attrs={"id": "imgTagWrapperId"})
|
||||||
|
|
||||||
if len(images) > 0:
|
if len(div) > 0:
|
||||||
images = json.loads(images[0]["data-a-dynamic-image"])
|
images = json.loads(div[0].img["data-a-dynamic-image"])
|
||||||
|
|
||||||
# Find largest image
|
# Find largest image
|
||||||
largest_image_url = None
|
largest_image_url = None
|
||||||
@ -76,7 +96,7 @@ def get_image(response, title):
|
|||||||
largest_image_size = images[image][0]
|
largest_image_size = images[image][0]
|
||||||
|
|
||||||
return largest_image_url
|
return largest_image_url
|
||||||
else:
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
@ -137,9 +157,25 @@ def get_price(response):
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
"""Main function"""
|
"""Main function"""
|
||||||
prod_src = fetch_url('https://www.amazon.de/dp/B082QDB6CG')
|
products = [
|
||||||
|
'B082QDB6CG',
|
||||||
|
'B07MBQPQ62',
|
||||||
|
'B07MBQPQ62',
|
||||||
|
'B09Y64QV33',
|
||||||
|
'B00F0DGRZO',
|
||||||
|
'B071J8CZP9',
|
||||||
|
'B001MF002A',
|
||||||
|
'B082QM712M',
|
||||||
|
'B091DV8SXG',
|
||||||
|
]
|
||||||
|
|
||||||
print("Title: " + get_title(prod_src) + "\n")
|
for p in products:
|
||||||
print("Image: " + get_image(prod_src, get_title(prod_src)) + "\n")
|
prod_src = fetch_url('https://www.amazon.de/dp/' + p)
|
||||||
|
|
||||||
|
print("-----------------------------------------------------" + p + "-----------------------------------------------------")
|
||||||
|
print("Title: " + str(get_title(prod_src)) + "\n")
|
||||||
|
print("Image: " + str(get_image(prod_src)) + "\n")
|
||||||
print("Price: " + str(get_price(prod_src)) + "\n")
|
print("Price: " + str(get_price(prod_src)) + "\n")
|
||||||
print("Description: " + get_description(prod_src))
|
print("Description: " + str(get_description(prod_src)) + "\n\n")
|
||||||
|
|
||||||
|
time.sleep(2)
|
||||||
|
Loading…
Reference in New Issue
Block a user