Added a fetcher that retrieves the product title, image URL, price and description
This commit is contained in:
parent
7bd48745c0
commit
bdd342f8ac
@ -3,3 +3,5 @@ python-dotenv~=0.20.0
|
|||||||
APScheduler~=3.9.1
|
APScheduler~=3.9.1
|
||||||
SQLAlchemy~=1.4.36
|
SQLAlchemy~=1.4.36
|
||||||
mysqlclient~=1.4.6
|
mysqlclient~=1.4.6
|
||||||
|
requests~=2.27.1
|
||||||
|
beautifulsoup4~=4.9.1
|
@ -1 +1,144 @@
|
|||||||
"""script with functions for fetching product data from amazon"""
|
"""script with functions for fetching product data from amazon"""
|
||||||
|
import json
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_url(url, timeout=10):
    """Fetch a url and return the response body as text

    Args:
        url (String): url to fetch
        timeout (Number): seconds to wait for the server before giving up,
            passed unchanged to requests.get

    Returns:
        Text: html response from amazon

    Raises:
        requests.exceptions.RequestException: raised by requests when the
            request fails or the timeout expires
    """
    # Send a desktop-browser User-Agent instead of the default one from requests
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'
    }

    # Without an explicit timeout requests waits indefinitely on a stalled server
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text
|
||||||
|
|
||||||
|
|
||||||
|
def get_title(response):
    """Get title from response

    The title is read from the first <meta name="title"> tag of the page.

    Args:
        response (Text): html response from amazon

    Returns:
        String: the part of the tag's content attribute before the first ":",
            with surrounding whitespace removed; None when the page has no
            such meta tag or the tag has no content attribute

    Raises:
        None: None
    """
    soup = BeautifulSoup(response, 'html.parser')

    # Only the first matching tag is ever used, so find() is sufficient
    meta = soup.find("meta", attrs={'name': 'title'})
    if meta is None:
        return None

    # .get() avoids a KeyError on a meta tag that carries no content attribute
    content = meta.get("content")
    if content is None:
        return None

    # Keep only the text in front of the first colon
    return content.split(":")[0].strip()
|
||||||
|
|
||||||
|
|
||||||
|
def get_image(response, title):
    """Get image url from response

    Looks for the first <img> tag whose alt text equals the product title and
    reads the JSON object stored in its data-a-dynamic-image attribute.

    Args:
        response (Text): html response from amazon
        title (String): title of product, may be None

    Returns:
        String: key of the JSON object whose value has the largest first
            element (presumably url -> [width, height], confirm against a
            live page); None when title is None, no image matches, the
            attribute is missing or the JSON object is empty

    Raises:
        ValueError: if the data-a-dynamic-image attribute is not valid JSON
    """
    # get_title() may return None; there is nothing to match against then
    if title is None:
        return None

    soup = BeautifulSoup(response, 'html.parser')

    # NOTE(review): the scraped code read replace("&", "&"), which is a no-op.
    # Presumably an "&amp;" entity was decoded on the way; confirm upstream.
    wanted_alt = title.replace("&amp;", "&")
    images = soup.find_all("img", alt=lambda alt: alt == wanted_alt)
    if not images:
        return None

    # .get() avoids a KeyError on an image without the dynamic-image attribute
    raw_sizes = images[0].get("data-a-dynamic-image")
    if raw_sizes is None:
        return None

    sizes_by_url = json.loads(raw_sizes)

    # Largest first element wins; on ties the earliest entry is kept
    return max(sizes_by_url, key=lambda url: sizes_by_url[url][0], default=None)
|
||||||
|
|
||||||
|
|
||||||
|
def get_description(response):
    """Get description from response

    Collects the text of every <span> inside the first div with the id
    "feature-bullets".

    Args:
        response (Text): html response from amazon

    Returns:
        String: stripped span texts joined by newlines, skipping spans that
            contain the fit-check hint or a "›" character; None when the
            page has no feature-bullets div

    Raises:
        None: None
    """
    parsed = BeautifulSoup(response, 'html.parser')

    bullet_boxes = parsed.find_all("div", id="feature-bullets")
    if len(bullet_boxes) == 0:
        return None

    lines = []
    for span in bullet_boxes[0].find_all("span"):
        text = span.text
        # Skip the fit-check hint and spans containing the "›" character
        if "um sicherzustellen, dass dieser Artikel passt." in text or "›" in text:
            continue
        lines.append(text.strip())

    # Joining leaves no trailing newline to cut off afterwards
    return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def get_price(response):
    """Get price from response

    Reads the JSON text of the first div with the class
    "twister-plus-buying-options-price-data".

    Args:
        response (Text): html response from amazon

    Returns:
        Array: [priceAmount, currencySymbol] of the first JSON entry; None
            when the div is missing or the JSON value is empty

    Raises:
        None: None
    """
    parsed = BeautifulSoup(response, 'html.parser')

    holders = parsed.find_all("div", {"class": "twister-plus-buying-options-price-data"})
    if len(holders) == 0:
        return None

    offers = json.loads(holders[0].text)
    if len(offers) == 0:
        return None

    # Only the first entry is reported
    first_offer = offers[0]
    return [first_offer["priceAmount"], first_offer["currencySymbol"]]
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual smoke test: fetch one product page and print what was extracted
    prod_src = fetch_url('https://www.amazon.de/dp/B082QDB6CG')

    # Extract the title once; it is printed and also needed to find the image
    prod_title = get_title(prod_src)

    # The image is matched by title, so skip the lookup when no title was found
    prod_image = get_image(prod_src, prod_title) if prod_title is not None else None

    # str() keeps a missing value (None) from raising TypeError on concatenation
    print("Title: " + str(prod_title) + "\n")
    print("Image: " + str(prod_image) + "\n")
    print("Price: " + str(get_price(prod_src)) + "\n")
    print("Description: " + str(get_description(prod_src)))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user