from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
# Function to extract Product Title
def get_title(soup):
    """Return the product title text found in a parsed product page.

    The title is read from the ``<span id="productTitle">`` element and
    returned with leading and trailing whitespace removed.

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find``.

    Returns:
        The stripped title string, or ``""`` when any step of the lookup
        raises ``AttributeError`` (for example when the lookup yields
        ``None`` because no matching element exists).
    """
    try:
        # Tag object -> its text content -> text without surrounding whitespace.
        return soup.find("span", attrs={"id": 'productTitle'}).text.strip()
    except AttributeError:
        return ""
# Function to extract Product Price
def get_price(soup):
    """Return the product price text found in a parsed product page.

    The regular price element (``id="priceblock_ourprice"``) is tried first;
    if it cannot be read, the deal price element
    (``id="priceblock_dealprice"``) is tried next.

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find``.

    Returns:
        The stripped price string from the first element that can be read,
        or ``""`` when neither lookup succeeds.
    """
    # NOTE(review): the run output recorded alongside this script shows no
    # price values, so these ids may no longer match the live page markup -
    # confirm against a current product page.
    for element_id in ("priceblock_ourprice", "priceblock_dealprice"):
        try:
            return soup.find("span", attrs={'id': element_id}).string.strip()
        except AttributeError:
            # Element missing (lookup gave None) or it has no single string;
            # move on to the next candidate. Only AttributeError is handled so
            # unrelated failures and interrupts are not silently swallowed.
            continue
    return ""
# Function to extract Product Rating
def get_rating(soup):
    """Return the product rating text found in a parsed product page.

    Two lookups are attempted in order: the ``<i>`` star icon queried with
    the class string ``'a-icon a-icon-star a-star-4-5'``, then the first
    ``<span class="a-icon-alt">``.

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find``.

    Returns:
        The stripped rating string from the first lookup that can be read,
        or ``""`` when neither lookup succeeds.
    """
    lookups = (
        ("i", {'class': 'a-icon a-icon-star a-star-4-5'}),
        ("span", {'class': 'a-icon-alt'}),
    )
    for tag_name, tag_attrs in lookups:
        try:
            return soup.find(tag_name, attrs=tag_attrs).string.strip()
        except AttributeError:
            # Element missing or without a single string; try the next one.
            # Only AttributeError is handled so unrelated failures and
            # interrupts are not silently swallowed.
            continue
    return ""
# Function to extract Number of User Reviews
def get_review_count(soup):
    """Return the review-count text found in a parsed product page.

    The value is read from the ``<span id="acrCustomerReviewText">`` element
    and returned with surrounding whitespace removed.

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find``.

    Returns:
        The stripped review-count string, or ``""`` when any step of the
        lookup raises ``AttributeError``.
    """
    try:
        count_tag = soup.find("span", attrs={'id': 'acrCustomerReviewText'})
        count_text = count_tag.string
        return count_text.strip()
    except AttributeError:
        return ""
# Function to extract Availability Status
def get_availability(soup):
    """Return the availability text found in a parsed product page.

    The text is read from the first ``<span>`` inside the
    ``<div id="availability">`` element and returned with surrounding
    whitespace removed.

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find``.

    Returns:
        The stripped availability string, or ``"Not Available"`` when any
        step of the lookup raises ``AttributeError``.
    """
    try:
        # Container div -> its first span -> that span's string, stripped.
        return soup.find("div", attrs={'id': 'availability'}).find("span").string.strip()
    except AttributeError:
        return "Not Available"
if __name__ == '__main__':
    # Script entry point: fetch one search-results page, follow every product
    # link on it, collect title/price/rating/reviews/availability for each
    # product, and write the rows that have a title to "amazon_data.csv".
    from urllib.parse import urljoin

    # Product hrefs are resolved against this origin.
    BASE_URL = "https://www.amazon.com"
    # Seconds to wait on each HTTP request; without a timeout a stalled
    # connection would block the script indefinitely.
    REQUEST_TIMEOUT = 30

    # add your user agent
    HEADERS = {'User-Agent': '', 'Accept-Language': 'en-US, en;q=0.5'}

    # The webpage URL
    URL = "https://www.amazon.com/s?k=playstation+4&ref=nb_sb_noss_2"

    # HTTP Request
    webpage = requests.get(URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)

    # Soup Object containing all data
    soup = BeautifulSoup(webpage.content, "html.parser")

    # Fetch links as List of Tag Objects
    links = soup.find_all("a", attrs={'class': 'a-link-normal s-no-outline'})

    # Keep only anchors that actually carry an href; a missing href comes
    # back as None and cannot be turned into a URL.
    links_list = [href for href in (link.get('href') for link in links) if href]

    d = {"title": [], "price": [], "rating": [], "reviews": [], "availability": []}

    # Loop for extracting product details from each link
    for link in links_list:
        # urljoin handles both site-relative and already-absolute hrefs.
        new_webpage = requests.get(
            urljoin(BASE_URL, link), headers=HEADERS, timeout=REQUEST_TIMEOUT
        )
        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        # Function calls to collect all necessary product information
        d['title'].append(get_title(new_soup))
        d['price'].append(get_price(new_soup))
        d['rating'].append(get_rating(new_soup))
        d['reviews'].append(get_review_count(new_soup))
        d['availability'].append(get_availability(new_soup))

    amazon_df = pd.DataFrame.from_dict(d)
    # Mark empty titles as missing so dropna can remove those rows. The result
    # is assigned back to the column instead of calling replace(inplace=True)
    # on the column selection, which pandas warns will stop working in 3.0.
    amazon_df['title'] = amazon_df['title'].replace('', np.nan)
    amazon_df = amazon_df.dropna(subset=['title'])
    amazon_df.to_csv("amazon_data.csv", header=True, index=False)
C:\Users\DELL\AppData\Local\Temp\ipykernel_11088\4254666078.py:42: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. amazon_df['title'].replace('', np.nan, inplace=True)
amazon_df
| | title | price | rating | reviews | availability |
|---|---|---|---|---|---|
| 0 | Sony Playstation PS4 1TB Black Console | | 4.6 out of 5 stars | 1,399 ratings | In Stock |
| 1 | Flagship Newest Play Station 4 1TB HDD Only on... | | 4.5 out of 5 stars | 199 ratings | Not Available |
| 2 | Newest Sony Playstation 4 Slim 1TB SSD Console... | | 4.3 out of 5 stars | 312 ratings | Not Available |
| 3 | PlayStation®5 Digital Edition (slim) | | 4.7 out of 5 stars | 3,478 ratings | Only 1 left in stock - order soon. |
| 4 | Charger Dock Station for PS4, 1.8 Hrs Fast Cha... | | 4.6 out of 5 stars | 14,246 ratings | In Stock |
| 5 | Wireless Controller Dual Vibration Game Joysti... | | 4.1 out of 5 stars | 1,258 ratings | In Stock |
| 6 | Wireless Controller for PS4 with 2 Thumb Grips... | | 4.1 out of 5 stars | 1,602 ratings | In Stock |
| 7 | VidPPluing Wireless Controller for PS4/Pro/Sli... | | 4.3 out of 5 stars | 400 ratings | In Stock |
| 8 | PlayStation 4 Slim 1TB Console - Marvel's Spid... | | 4.7 out of 5 stars | 2,814 ratings | Not Available |
| 9 | OIVO PS4 Stand Cooling Fan Station for Playsta... | | 4.5 out of 5 stars | 45,109 ratings | In Stock |
| 10 | PlayStation 4 Slim 500GB Console - Uncharted 4... | | 4.8 out of 5 stars | 6,453 ratings | Only 1 left in stock - order soon. |
| 11 | Turtle Beach Stealth 700 Gen 2 MAX Wireless Am... | | 4.3 out of 5 stars | 1,621 ratings | In Stock |
| 12 | OWC 2.0 TB External Hard Drive Upgrade for Son... | | 4.8 out of 5 stars | 60 ratings | Only 19 left in stock - order soon. |
| 13 | Minecraft Starter Collection (PS4) | | 4.6 out of 5 stars | 660 ratings | Only 1 left in stock - order soon. |
| 14 | Light-up Wireless Controller for PS4,Black Cra... | | 4.3 out of 5 stars | 269 ratings | In Stock |
| 15 | PlayStation 4 Slim 1TB Console - Black (Renewed) | | 4.1 out of 5 stars | 1,513 ratings | Only 2 left in stock - order soon. |
| 16 | PlayStation 4 Slim 1TB Console | | 4.7 out of 5 stars | 15,672 ratings | Not Available |
| 17 | The Crew Motorfest - Standard Edition, PlaySta... | | 4.7 out of 5 stars | 126 ratings | In Stock |
| 18 | Replacement Astro A40 A10 A30 A50 Gaming Heads... | | 3.9 out of 5 stars | 12 ratings | |
| 19 | Rolling Universal Gaming Backpack for Xbox One... | | 3.8 out of 5 stars | 51 ratings | In Stock |