|
| 1 | +''' |
| 2 | +Import the necessary libraries |
| 3 | +''' |
| 4 | +# !pip install selenium |
| 5 | +from selenium import webdriver |
| 6 | +import time |
| 7 | +import pandas as pd |
| 8 | +from bs4 import BeautifulSoup as soup |
| 9 | + |
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
CHROMEDRIVER_PATH = 'D:\\Softwares\\chromedriver_win32\\chromedriver.exe'
LISTING_URL = 'https://www.cardekho.com/filter/new-cars'
OUTPUT_CSV = 'carScrap.csv'

# Exact ``class`` attribute of the <div> that wraps one car listing.
CARD_CLASS = "gsc_col-md-12 gsc_col-sm-12 gsc_col-xs-12 append_list"

# Number of scroll-down / scroll-up rounds and the pause after each scroll.
SCROLL_ROUNDS = 20
SCROLL_PAUSE_SECONDS = 1
SCROLL_TO_BOTTOM = "window.scrollTo(0, document.body.scrollHeight)"
SCROLL_BACK_UP = "window.scrollTo(0, (document.body.scrollHeight)*0.73)"

# Placeholder stored when a listing lacks a field, so rows stay aligned.
MISSING = " "
COLUMNS = ['Car Name', 'Price', 'Engine', 'Mileage']


def fetch_listing_html(url=LISTING_URL, rounds=SCROLL_ROUNDS):
    """Open *url* in Chrome, scroll it repeatedly and return the page HTML.

    Each round jumps to the bottom of the page and then back up to 73% of
    its height, pausing after every jump (presumably so that the page can
    load further listings -- confirm against the live site).

    The browser is always closed, even when loading or scrolling fails.
    """
    # NOTE(review): newer Selenium releases may expect the driver path to be
    # passed through a Service object -- confirm against the installed
    # version before changing this call.
    driver = webdriver.Chrome(CHROMEDRIVER_PATH)
    try:
        driver.get(url)
        for _ in range(rounds):
            driver.execute_script(SCROLL_TO_BOTTOM)
            time.sleep(SCROLL_PAUSE_SECONDS)
            driver.execute_script(SCROLL_BACK_UP)
            time.sleep(SCROLL_PAUSE_SECONDS)
        return driver.execute_script(
            "return document.documentElement.outerHTML"
        )
    finally:
        driver.quit()


def text_before_asterisk(text: str) -> str:
    """Return the part of *text* that precedes the first ``*``.

    The whole string is returned when it contains no ``*``.
    """
    return text.partition("*")[0]


def spec_text(spec_list, title):
    """Return the text of the ``<span title=...>`` inside *spec_list*.

    *spec_list* is the card's ``dotlist`` element, or ``None`` when the card
    has none.  ``MISSING`` is returned when the element or span is absent.
    """
    if spec_list is None:
        return MISSING
    span = spec_list.find("span", {"title": title})
    return span.text if span is not None else MISSING


def parse_card(card):
    """Extract ``(name, price, engine, mileage)`` from one listing card.

    Every field falls back to ``MISSING`` when its element is absent, so one
    row is always produced per card and the table columns stay the same
    length.
    """
    # The car name is taken from the ``alt`` text of the image inside the
    # card's first <div>; fall back to the placeholder if any link in that
    # chain is missing instead of raising.
    first_div = card.div
    image = first_div.img if first_div is not None else None
    name = image.get("alt", MISSING) if image is not None else MISSING

    price_box = card.find("div", {"class": "price"})
    if price_box is None:
        price = MISSING
    else:
        price = text_before_asterisk(price_box.text)

    spec_list = card.find("div", {"class": "dotlist"})
    engine = spec_text(spec_list, "Engine Displacement")
    mileage = spec_text(spec_list, "Mileage")
    return name, price, engine, mileage


def parse_listing(html):
    """Parse the page *html* and return one field tuple per listing card."""
    page = soup(html, "lxml")
    cards = page.find_all("div", {"class": CARD_CLASS})
    return [parse_card(card) for card in cards]


def build_table(rows) -> pd.DataFrame:
    """Build the output table from ``(name, price, engine, mileage)`` rows.

    An empty *rows* sequence yields an empty table that still carries all
    four column headers.
    """
    return pd.DataFrame(list(rows), columns=COLUMNS)


def main():
    """Scrape the listing page and write the result to ``carScrap.csv``."""
    html = fetch_listing_html()
    table = build_table(parse_listing(html))
    table.to_csv(OUTPUT_CSV, index=False, encoding='utf-8')


if __name__ == "__main__":
    main()
0 commit comments