GuessThePrice/source/fetcher.py

210 lines
5.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""script with functions for fetching product data from amazon"""
import json
import os
import time
from bs4 import BeautifulSoup
from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.webdriver import firefox
def fetch_url(url):
    """Fetch a url with a headless Firefox and return the rendered page source.

    A virtual display is started so that Firefox can run without a visible
    screen. The browser and the display are always shut down again, even when
    starting the browser or loading the page fails.

    Args:
        url (String): url to fetch

    Returns:
        Text: html response from amazon (the browser's page source)

    Raises:
        Nothing is caught here: any exception raised by the virtual display
        or by selenium propagates to the caller after cleanup has run.

    Test:
        What happens if url returns non 200 status code
        What happens if url returns 200 status code
        What happens if Firefox, Gecko or important libraries are not installed
    """
    display = Display(visible=False, size=(800, 600))
    display.start()
    try:
        # Download-related preferences; the download directory is the current
        # working directory of the process.
        firefox_options = firefox.options.Options()
        firefox_options.set_preference('browser.download.folderList', 2)
        firefox_options.set_preference(
            'browser.download.manager.showWhenStarting', False
        )
        firefox_options.set_preference('browser.download.dir', os.getcwd())
        firefox_options.set_preference(
            'browser.helperApps.neverAsk.saveToDisk', 'text/csv'
        )

        browser = webdriver.Firefox(options=firefox_options)
        try:
            browser.get(url)
            # No status check is performed: whatever the browser rendered is
            # returned as-is.
            return browser.page_source
        finally:
            # Quit even if loading the page failed, so no Firefox process is
            # left behind.
            browser.quit()
    finally:
        # Stop the virtual display even if the browser could not be started.
        display.stop()
def get_title(response):
    """Get title from response

    The title is read from the ``content`` attribute of the first
    ``<meta name="title">`` tag. Only the part before the first colon is
    kept, with surrounding whitespace removed.

    Args:
        response (Text): html response from amazon

    Returns:
        String: title of product, or None when the response has no such meta
        tag or the tag has no ``content`` attribute

    Raises:
        None: None

    Test:
        Response contains meta tag with title attribute
        Response doesn't contain meta tag
        Response contains meta tag without content attribute
        Response is invalid html
    """
    soup = BeautifulSoup(response, 'html.parser')
    meta = soup.find("meta", attrs={'name': 'title'})
    if meta is None:
        return None

    # A meta tag without a content attribute used to raise KeyError; treat it
    # like a missing title instead.
    content = meta.get("content")
    if content is None:
        return None

    return content.split(":")[0].strip()
def get_image(response):
    """Get image urls from response

    Looks up the div with id "imgTagWrapperId", parses the JSON stored in the
    ``data-a-dynamic-image`` attribute of its ``<img>`` tag and returns the url
    whose first size entry is the largest.

    Args:
        response (Text): html response from amazon

    Returns:
        String: product image url, or None when the div, the img tag, the
        attribute or usable JSON data is missing

    Raises:
        None: None

    Test:
        Response contains div with "imgTagWrapperId" id and includes json data
        Response doesn't contain div
        Response contains div without img tag or without the data attribute
        Response is invalid html
        JSON is not valid
    """
    soup = BeautifulSoup(response, 'html.parser')
    wrapper = soup.find("div", attrs={"id": "imgTagWrapperId"})
    if wrapper is None or wrapper.img is None:
        return None

    raw_images = wrapper.img.get("data-a-dynamic-image")
    if raw_images is None:
        return None

    try:
        images = json.loads(raw_images)
    except json.JSONDecodeError:
        # Malformed attribute content is treated like a missing image.
        return None

    # The data is expected to map each image url to a size list whose first
    # entry is compared (presumably [width, height] - confirm against a live
    # page).
    if not isinstance(images, dict) or not images:
        return None

    # max() returns the first url with the largest value, which matches the
    # previous "strictly greater replaces" loop.
    return max(images, key=lambda image_url: images[image_url][0])
def get_description(response):
    """Get description from response

    Collects the stripped text of every ``<span>`` inside the div with id
    "feature-bullets", one per line. Blank spans and the German
    "make sure this item fits" boilerplate are left out.

    Args:
        response (Text): html response from amazon

    Returns:
        String: product description (may be empty), or None when the response
        has no "feature-bullets" div

    Raises:
        None: None

    Test:
        Response contains div with "feature-bullets" id
        Response doesn't contain div
        Response is invalid html
    """
    soup = BeautifulSoup(response, 'html.parser')
    bullets = soup.find("div", id="feature-bullets")
    if bullets is None:
        return None

    # NOTE(review): the previous filter also required `"" not in item.text`,
    # which is always False (the empty string is contained in every string),
    # so no bullet was ever collected. The literal was probably a special
    # character that got lost - confirm the intended marker. Until then only
    # blank spans and the fit-check boilerplate are skipped.
    lines = []
    for span in bullets.find_all("span"):
        text = span.text.strip()
        if not text:
            continue
        if "um sicherzustellen, dass dieser Artikel passt." in text:
            continue
        lines.append(text)

    return "\n".join(lines)
def get_price(response):
    """Get price from response

    Parses the JSON text of the first div with class
    "twister-plus-buying-options-price-data" and reads "priceAmount" and
    "currencySymbol" from the first entry of the resulting list.

    Args:
        response (Text): html response from amazon

    Returns:
        Array: product price and currency as ``[price, currency]``. The
        currency is None when the entry has no "currencySymbol". None is
        returned when the div is missing, its text is not valid JSON, the
        JSON is not a non-empty list, or the first entry has no
        "priceAmount".

    Raises:
        None: None

    Test:
        Response contains div with class "twister-plus-buying-options-price-data"
        Response doesn't contain div
        No valid JSON
        Only price, currency missing
        Response is invalid html
    """
    soup = BeautifulSoup(response, 'html.parser')
    container = soup.find(
        "div", {"class": "twister-plus-buying-options-price-data"}
    )
    if container is None:
        return None

    try:
        offers = json.loads(container.text)
    except json.JSONDecodeError:
        # Unparseable price data is treated like a missing price.
        return None

    # Only a non-empty list can be indexed with [0] the way the data is used
    # below; anything else means there is no usable price.
    if not isinstance(offers, list) or not offers:
        return None

    first_offer = offers[0]
    if not isinstance(first_offer, dict) or "priceAmount" not in first_offer:
        return None

    return [first_offer["priceAmount"], first_offer.get("currencySymbol")]
if __name__ == "__main__":
    # Manual smoke test: fetch a few amazon.de product pages and print what
    # each extractor returns for them.
    # NOTE(review): 'B07MBQPQ62' is listed twice, so that page is fetched
    # twice - confirm whether this is intended.
    products = [
        'B082QDB6CG',
        'B07MBQPQ62',
        'B07MBQPQ62',
        'B09Y64QV33',
        'B00F0DGRZO',
        'B071J8CZP9',
        'B001MF002A',
        'B082QM712M',
        'B091DV8SXG',
    ]

    for product_id in products:
        page = fetch_url('https://www.amazon.de/dp/' + product_id)

        header = (
            "-----------------------------------------------------"
            + product_id
            + "-----------------------------------------------------"
        )
        print(header)
        print(f"Title: {get_title(page)}\n")
        print(f"Image: {get_image(page)}\n")
        print(f"Price: {get_price(page)}\n")
        print(f"Description: {get_description(page)}\n\n")

        # Pause between products before the next page is requested.
        time.sleep(2)