Skip to content Skip to sidebar Skip to footer

I Scraped Title And Price And Links And Info Table I Name It Planet_data And When I Write Csv File I Get Duplicated Planet_data

I want to remove duplicate planet_data import requests import csv from bs4 import BeautifulSoup requests.packages.urllib3.disable_warnings() import pandas as pd url = 'https://ww

Solution 1:

Remove the while loop and the inner for-loop, and initialize the data list outside the for-loop. The code below scrapes the first page of product details.

Example:

import csv
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

requests.packages.urllib3.disable_warnings()

# Position of each field's <li> inside the product description list.
# Index 0 is not read by this script.
DESCRIPTION_FIELDS = {
    'Weight': 1,
    'Shape': 2,
    'Dimensions': 3,
    'Color': 4,
    'Clarity': 5,
    'Cutting': 6,
    'Treatment': 7,
    'Origin': 8,
    # The original code read index 6 here, which duplicated 'Cutting'.
    # Index 9 is the only slot left between Origin (8) and Price Per
    # Carat (10) -- presumably the hardness entry; verify on a live page.
    'Hardness': 9,
    'Price Per Carat': 10,
}


def extract_fields(items):
    """Map the description list items of one product to named fields.

    Args:
        items: Text of every ``<li>`` in the product description, in page
            order.

    Returns:
        A dict with one key per entry in ``DESCRIPTION_FIELDS`` (same
        order). Each value is the stripped text at that field's position,
        or ``''`` when the list is too short to contain it.
    """
    return {
        name: items[index].strip() if index < len(items) else ''
        for name, index in DESCRIPTION_FIELDS.items()
    }


if __name__ == '__main__':
    url = 'https://www.paraibainternational.com/collections/gemstone?view=list'
    session = requests.Session()
    session.headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}

    content = session.get(url, verify=False).content
    soup = BeautifulSoup(content, "html.parser")
    posts = soup.find_all('div', {'class': 'product-details'})

    # One list for the whole run: every product is appended exactly once.
    data = []

    # A separate loop name keeps `url` pointing at the listing page.
    for post in posts:
        title = post.find('h2').text.strip()
        price = post.find('span', {'class': 'money'}).text.strip()
        link = post.find('form').find('a', href=True).get('href')

        # `link` is site-relative (starts with '/'); urljoin avoids the
        # doubled slash that plain string concatenation produced.
        product_url = urljoin(url, link)
        # Reuse the session so the detail request carries the same
        # User-Agent header and shares the connection pool.
        product_soup = BeautifulSoup(session.get(product_url).text, 'html.parser')

        description = product_soup.find(
            'div', {'class': 'rte main-product-description-product'})
        # Collect the <li> texts once; a page without the description
        # block yields empty fields instead of an AttributeError.
        if description is not None:
            items = [li.text for li in description.find_all('li')]
        else:
            items = []

        planet_data = extract_fields(items)
        planet_data['title'] = title
        planet_data['price'] = price
        planet_data['link'] = link
        data.append(planet_data)

    print(data)

Output:

[{'Weight': 'Weight (Carats): 3.14', 'Shape': 'Shape: Cushion', 'Dimensions': 'Dimensions (L x W x D) (mm): 8.61 x 8.44 x 6.28', 'Color': 'Color: Neon Blue', 'Clarity': 'Clarity: SI', 'Cutting': 'Cutting: Excellent', 'Treatment': 'Treatment:\xa0Heat', 'Origin': 'Origin: Brazil', 'Hardness': 'Cutting: Excellent', 'Price Per Carat': 'Price Per Carat: $60,000', 'title': 'Paraiba Tourmaline Brazil 3.14 Carats', 'price': '$188,400.00', 'link': '/collections/gemstone/products/paraiba-tourmaline-3-14-carats'}, {'Weight': 'Weight (Carats): 2.78', 'Shape': 'Shape: Round', 'Dimensions': 'Dimensions (L x W x D) (mm): 8.0 x 8.0 x 5.3', 'Color': 'Color: Pink', 'Clarity': 'Clarity: IF', 'Cutting': 'Cutting: Excellent', 'Treatment': 'Treatment:\xa0Heat', 'Origin': 'Origin:\xa0Africa', 'Hardness': 'Cutting: Excellent', 'Price Per Carat': 'Price Per Carat: $80', 'title': 'Pink Tourmaline 2.78 Carats', 'price': '$222.40', 'link': '/collections/gemstone/products/pink-tourmaline-2-78-carats-round'}, {'Weight': 'Weight (Carats): 2.78', 'Shape': 'Shape: Oval', 'Dimensions': 'Dimensions (L x W x D) (mm): 9.8 x 8.9 x 5.7', 'Color': 'Color: Intense Pink', 'Clarity': 'Clarity: IF', 'Cutting': 'Cutting: Excellent', 'Treatment': 'Treatment:\xa0Heat', 'Origin': 'Origin:\xa0Africa', 'Hardness': 'Cutting: Excellent', 'Price Per Carat': 'Price Per Carat: $430', 'title': 'Pink Tourmaline 2.78 Carats', 'price': '$1,195.40', 'link': '/collections/gemstone/products/pink-tourmaline-2-78-carats-oval'}, {'Weight': 'Weight (Carats): 2.59', 'Shape': 'Shape: Pear', 'Dimensions': 'Dimensions (L x W x D) (mm): 12.0 x 7.5 x 5.4', 'Color': 'Color: Green', 'Clarity': 'Clarity: IF', 'Cutting': 'Cutting: Excellent', 'Treatment': 'Treatment:\xa0Heat', 'Origin': 'Origin:\xa0Africa', 'Hardness': 'Cutting: Excellent', 'Price Per Carat': 'Price Per Carat: $230', 'title': 'Green Tourmaline 2.59 Carats', 'price': '$595.70', 'link': '/collections/gemstone/products/green-tourmaline-2-59-carats-pear'}]

Post a Comment for "I Scraped Title And Price And Links And Info Table I Name It Planet_data And When I Write Csv File I Get Duplicated Planet_data"