# finishline_review.py (from the kellyho15/FinishLine repository, master branch)
#
# Selenium script that opens one finishline.com product page, walks through
# its paginated customer reviews and writes one CSV row per review to
# finishline_review.csv.

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import re

# Product page whose reviews are scraped.
PRODUCT_URL = "https://www.finishline.com/store/product/mens-nike-air-zoom-mariah-flyknit-racer-running-shoes/prod902579?styleId=918264&colorId=401"
# CSV file that receives one row per review (no header row is written).
OUTPUT_CSV = 'finishline_review.csv'
# Seconds to wait for the review list / next-page button to appear.
WAIT_SECONDS = 10


def _text_or_blank(parent, xpath):
    """Return the text of the first element matching *xpath* under *parent*.

    Returns "" when no such element exists, so an optional field never
    aborts the scrape of a review.
    """
    try:
        return parent.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return ""


def _attr_or_blank(parent, xpath, attribute):
    """Return *attribute* of the first element matching *xpath* under *parent*.

    Returns "" when no such element exists.
    """
    try:
        return parent.find_element(By.XPATH, xpath).get_attribute(attribute)
    except NoSuchElementException:
        return ""


def _read_summary(browser):
    """Return the six product-level summary fields shown on the current page.

    Order: ave_stars, total_review, recom, ave_size, ave_width, ave_comfort.

    Raises:
        NoSuchElementException: if any summary element is missing. The page
            loop treats that like any other page error and stops.
    """
    ave = browser.find_element(By.XPATH, '//div[@class="BVDIBody BVDI_QTBodySummaryBox"]')

    # NOTE(review): these XPaths start with '//' and therefore search the
    # whole document, not just the summary box `ave`. They are kept exactly
    # as they were; confirm against the live DOM before scoping them with './/'.
    ave_stars = ave.find_element(By.XPATH, '//div[@class="BVRRRatingNormalImage"]/img').get_attribute('title')
    total_review = ave.find_element(By.XPATH, '//span[@class="BVRRCustomReviewCountNumber"]').text
    recom = ave.find_element(By.XPATH, '//span[@class="BVRRBuyAgainPercentage"]').text
    ave_size = ave.find_element(By.XPATH, '(//div[@class="BVRRRatingSliderImage"]/img)').get_attribute('title')
    ave_width = ave.find_element(By.XPATH, '(//div[@class="BVRRRatingSliderImage"]/img)[2]').get_attribute('title')
    ave_comfort = ave.find_element(By.XPATH, '(//div[@class="BVRRRatingSliderImage"]/img)[3]').get_attribute('title')
    return [ave_stars, total_review, recom, ave_size, ave_width, ave_comfort]


def _read_review(review):
    """Return the eight per-review fields of one review element.

    Order: stars, title, user, review_text, reason, helpful, not_helpful,
    date. XPaths are relative ('.//') so each lookup stays inside *review*;
    a field whose element is absent is returned as "".
    """
    return [
        _attr_or_blank(review, './/div[@class="BVRRRatingNormalImage"]/img', 'title'),
        _text_or_blank(review, './/span[@itemprop="name"]'),
        _text_or_blank(review, './/span[@itemprop="author"]'),
        _text_or_blank(review, './/span[@class="BVRRReviewText"]'),
        _text_or_blank(review, './/span[@class="BVRRValue BVRRContextDataValue BVRRContextDataValuePurchaseReason"]'),
        _text_or_blank(review, './/span[@class="BVDINumber"]'),
        _text_or_blank(review, '(.//span[@class="BVDINumber"])[2]'),
        _attr_or_blank(review, './/meta[@itemprop="datePublished"]', 'content'),
    ]


# open new chrome browser
driver = webdriver.Chrome()

try:
    # open csv to record data
    # newline='' stops the csv module from emitting blank rows on Windows;
    # an explicit encoding keeps non-ASCII review text from failing the write.
    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)

        # go to the page to scrape
        driver.get(PRODUCT_URL)

        # get item info
        item = driver.find_element(By.XPATH, '//h1[@id="title"]').text
        price = driver.find_element(By.XPATH, '//div[@class="productPrice"]/span').text

        # Click review button to go to the review section
        review_button = driver.find_element(By.XPATH, '//span[@class="BVRRNumber"]')
        review_button.click()

        # Page index used to keep track of where we are.
        index = 1
        while True:
            try:
                print("Scraping Page number " + str(index))
                index = index + 1

                # Find all the reviews on the page
                wait_review = WebDriverWait(driver, WAIT_SECONDS)
                reviews = wait_review.until(EC.presence_of_all_elements_located(
                    (By.XPATH, '//span[@itemprop="review"]')))

                summary = _read_summary(driver)

                for review in reviews:
                    # Row layout: item, price, the six summary fields, then
                    # the eight per-review fields.
                    writer.writerow([item, price] + summary + _read_review(review))

                # Locate the next button on the page
                wait_button = WebDriverWait(driver, WAIT_SECONDS)
                next_button = wait_button.until(EC.element_to_be_clickable(
                    (By.XPATH, '//button[@class="button pag-button light-gray ml-1"]')))
                next_button.click()
                # Give the next page of reviews time to load.
                time.sleep(2)

            except Exception as e:
                # Best-effort stop: any failure while handling a page (most
                # commonly no further "next" button) ends the scrape. The
                # error is printed so an unexpected failure is still visible.
                print(e)
                break
finally:
    # quit() (rather than close()) also shuts down the chromedriver process,
    # and running it in `finally` releases the browser on every exit path.
    driver.quit()