"""script with functions for fetching product data from amazon""" import json import os import time from bs4 import BeautifulSoup from pyvirtualdisplay import Display from selenium import webdriver from selenium.webdriver import firefox def fetch_url(url): """fetch url and return response Args: url (String): url to fetch Returns: Text: html response from amazon Raises: None: None Test: What happens if url returns non 200 status code What happens if url returns 200 status code What happens if Firefox, Gecko or important libraries are not installed """ display = Display(visible=False, size=(800, 600)) display.start() firefox_options = firefox.options.Options() firefox_options.set_preference('browser.download.folderList', 2) firefox_options.set_preference( 'browser.download.manager.showWhenStarting', False ) firefox_options.set_preference('browser.download.dir', os.getcwd()) firefox_options.set_preference( 'browser.helperApps.neverAsk.saveToDisk', 'text/csv' ) browser = webdriver.Firefox(options=firefox_options) browser.get(url) source = browser.page_source browser.quit() display.stop() return source def get_title(response): """Get title from response Args: response (Text): html response from amazon Returns: String: title of product Raises: None: None Test: Response contains meta tag with title attribute Response doesn't contain meta tag Response is invalid html """ soup = BeautifulSoup(response, 'html.parser') title = soup.find_all("meta", attrs={'name': 'title'}) if len(title) > 0: return title[0]["content"].split(":")[0].strip() else: return None def get_image(response): """Get image urls from response Args: response (Text): html response from amazon Returns: String: product image url Raises: None: None Test: Response contains div with "imgTagWrapperId" id and includes json data Response doesn't contain div Response is invalid html JSON is not valid """ soup = BeautifulSoup(response, 'html.parser') div = soup.find_all("div", attrs={"id": "imgTagWrapperId"}) if len(div) > 0: images = json.loads(div[0].img["data-a-dynamic-image"]) # Find largest image largest_image_url = None largest_image_size = 0 for image in images: if largest_image_url is None and largest_image_size == 0 or images[image][0] > largest_image_size: largest_image_url = image largest_image_size = images[image][0] return largest_image_url return None def get_description(response): """Get description from response Args: response (Text): html response from amazon Returns: String: product description Raises: None: None Test: Response contains div with "feature-bullets" id Response doesn't contain div Response is invalid html """ soup = BeautifulSoup(response, 'html.parser') description = soup.find_all("div", id="feature-bullets") ret = "" if len(description) > 0: for item in description[0].find_all("span"): if "um sicherzustellen, dass dieser Artikel passt." not in item.text and "›" not in item.text: ret += item.text.strip() + "\n" # Remove last newline and return return ret[:-1] else: return None def get_price(response): """Get price from response Args: response (Text): html response from amazon Returns: Array: product price and currency Raises: None: None Test: Response contains div with class "twister-plus-buying-options-price-data" Response doesn't contain div No valid JSON Only price, currency missing Response is invalid html """ soup = BeautifulSoup(response, 'html.parser') price = soup.find_all("div", {"class": "twister-plus-buying-options-price-data"}) if len(price) > 0: j = json.loads(price[0].text) if len(j) > 0: return [j[0]["priceAmount"], j[0]["currencySymbol"]] else: return None else: return None if __name__ == "__main__": """Main function""" products = [ 'B082QDB6CG', 'B07MBQPQ62', 'B07MBQPQ62', 'B09Y64QV33', 'B00F0DGRZO', 'B071J8CZP9', 'B001MF002A', 'B082QM712M', 'B091DV8SXG', ] for p in products: prod_src = fetch_url('https://www.amazon.de/dp/' + p) print("-----------------------------------------------------" + p + "-----------------------------------------------------") print("Title: " + str(get_title(prod_src)) + "\n") print("Image: " + str(get_image(prod_src)) + "\n") print("Price: " + str(get_price(prod_src)) + "\n") print("Description: " + str(get_description(prod_src)) + "\n\n") time.sleep(2)