diff --git a/intelmotherboards.py b/intelmotherboards.py index cb2d74c..ed6e463 100644 --- a/intelmotherboards.py +++ b/intelmotherboards.py @@ -2,11 +2,11 @@ import requests from neweggpy.nefuncs import IterPages,BoolToInt,getPIDS,getData,insertData -baseurl = 'http://m.newegg.com/ProductList?description=Intel+Motherboards' + \ - '&categoryId=280&storeId=1&nodeId=7627&parentCategoryId=20' + \ - '&isSubCategory=true&categoryType=1' +baseurl = 'https://m.newegg.com/ProductList?description=FHLEsA70dKzKoDqR2lBeblLSvjuwGkdtB%252fdjJTC%252f8VU%253d&storeid=1&categoryid=-1&nodeid=7627&storetype=2&subcategoryid=280&brandid=-1&nvalue=100007627&showseealldeals=False&itemcount=0&issubcategory=true&level=3' -pg1 = requests.get(baseurl).content +headers = {'User-Agent': 'Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'} + +pg1 = requests.get(baseurl, headers=headers).content root1 = lxml.html.fromstring(pg1) page_count = IterPages(root1) URLs = ['%s&Page=%s' % (baseurl, pgnum) for pgnum in range(1, page_count + 1)] diff --git a/laptop.py b/laptop.py index 22251fa..7026ee0 100644 --- a/laptop.py +++ b/laptop.py @@ -2,11 +2,11 @@ import requests from neweggpy.nefuncs import IterPages,BoolToInt,getPIDS,getData,insertData -baseurl = 'http://m.newegg.com/ProductList?description=Laptops+%2f+' + \ - 'Notebooks&categoryId=32&storeId=3&nodeId=6740&' + \ - 'parentCategoryId=223&isSubCategory=true&categoryType=1' +baseurl = 'https://m.newegg.com/productlist?description=nVjkbn88TPpxbYCO44j7AJ7lKvYiECglej4lhl3FXQM%253d&storeid=3&categoryid=-1&nodeid=6740&storetype=2&subcategoryid=32&brandid=-1&nvalue=100006740&showseealldeals=False&itemcount=0&issubcategory=true&level=3' + +headers = {'User-Agent': 'Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'} +pg1 = requests.get(baseurl, headers=headers).content -pg1 = requests.get(baseurl).content root1 = lxml.html.fromstring(pg1) page_count = IterPages(root1) URLs = ['%s&Page=%s' % (baseurl, pgnum) for pgnum in range(1, page_count + 1)] diff --git a/neweggpy/nefuncs.py b/neweggpy/nefuncs.py index db85b71..c14040c 100644 --- a/neweggpy/nefuncs.py +++ b/neweggpy/nefuncs.py @@ -9,14 +9,15 @@ import os import requests import sqlite3 +import traceback +import sys dtn = datetime.now().strftime('%Y-%m-%d %H:%M:%S') def IterPages(rootobj): - t = rootobj.cssselect('span.colorGrey')[0].text - t = filter(lambda x: x.isdigit(), t) - return int(ceil(int(t)/20)) + t = rootobj.cssselect('#pagesNum > option:nth-child(1)')[0].text + return int(ceil(int(t[2:])/20)) def BoolToInt(boolobj): @@ -31,13 +32,15 @@ def getPIDS(urlList, pg1root): ProductList = [] for k, url in enumerate(urlList): if k is 0: # Reuse the root object for the first page - for el in pg1root.cssselect('a.listCell'): + for el in pg1root.cssselect('a.item-cell'): ProductList.append(el.attrib['href']) else: - r = requests.get(url).content - for el in fromstring(r).cssselect('a.listCell'): + headers = { + 'User-Agent': 'Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'} + r = requests.get(url, headers=headers).content + for el in fromstring(r).cssselect('a.item-cell'): ProductList.append(el.attrib['href']) - pids = [i.split('=')[1] for i in ProductList if i.count('itemNumber=') == 1] + pids = [i.replace('https://m.newegg.com/products/', '') for i in ProductList] return pids @@ -45,51 +48,61 @@ def getData(pidList): apiurl = 'http://www.ows.newegg.com/Products.egg' OutData = [] for pid in pidList: + print pid sleep(1) try: - r = requests.get('%s/%s' % (apiurl, pid)).content + url = '%s/%s' % (apiurl, pid) + headers = { + 'User-Agent': 'Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0'} + r = requests.get(url,headers=headers).content js = loads(r) + basic = js['Basic'] + additional = js['Additional'] g = {} - g['Title'] = js['Title'] - final_price = js['FinalPrice'].replace(',', '') + g['Title'] = basic['Title'] + final_price = basic['FinalPrice'].replace(',', '') if final_price.count('Checkout') == 1: g['FinalPrice'] = float('NaN') elif final_price == 'See price in cart': - g['FinalPrice'] = float(js['MappingFinalPrice'].replace(',', '').replace('$', '')) + g['FinalPrice'] = float(basic['MappingFinalPrice'].replace(',', '').replace('$', '')) else: g['FinalPrice'] = float(final_price.replace('$', '')) - g['OriginalPrice'] = float(js['OriginalPrice'].replace(',', '').replace('$', '')) - g['Instock'] = BoolToInt(js['Instock']) - g['Rating'] = js['ReviewSummary']['Rating'] + if (basic['OriginalPrice'] != ''): + g['OriginalPrice'] = float(basic['OriginalPrice'].replace(',', '').replace('$', '')) + else: + g['OriginalPrice'] = 0.0 + g['Instock'] = BoolToInt(basic['Instock']) + g['Rating'] = basic['ReviewSummary']['Rating'] try: - g['TotalReviews'] = le(js['ReviewSummary']['TotalReviews'])[0] + g['TotalReviews'] = le(basic['ReviewSummary']['TotalReviews'])[0] except: g['TotalReviews'] = 0 - g['IsHot'] = BoolToInt(js['IsHot']) - ShippingPrice = js['ShippingInfo']['NormalShippingText'].split(' ')[0] - if ShippingPrice.count('Free') == 1: + g['IsHot'] = BoolToInt(basic['IsHot']) + ShippingPrice = basic['ShippingText'].split(' ')[0] + if ShippingPrice.count('FREE') == 1: g['ShippingPrice'] = 0.0 elif ShippingPrice.count('Special') == 1: g['ShippingPrice'] = 2.99 # "Special shipping => $2.99 Egg Saver Shipping" else: g['ShippingPrice'] = float(ShippingPrice.replace('$', '')) - g['IsShipByNewegg'] = BoolToInt(js['IsShipByNewegg']) + g['IsShipByNewegg'] = BoolToInt(additional['ShippingInfo']['IsShipByNewegg']) - if len(js['PromotionText']) > 0: - g['Promotion'] = js['PromotionText'] + if len(basic['PromotionText']) > 0: + g['Promotion'] = basic['PromotionText'] else: g['Promotion'] = 'NaN' - MIR = js['MailInRebateInfo'] + MIR = additional['MailInRebates'] if MIR is None: g['MailInRebateInfo'] = 'NaN' else: - g['MailInRebateInfo'] = js['MailInRebateInfo'][0] + g['MailInRebateInfo'] = additional['MailInRebates'][0] g['PID'] = pid - g['Brand'] = js['CoremetricsInfo']['Brand'] + g['Brand'] = basic['ItemBrand']['Description'] g['Date'] = dtn OutData.append(g) - except: - print 'FAILED: %s' % pid + except Exception, e: + print 'FAILED: %s %s' % (pid, e) + traceback.print_exc() pass dframe = DataFrame(OutData) dframe['FinalPriceShipped'] = dframe['FinalPrice'] + dframe['ShippingPrice']