Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.idea
4 changes: 0 additions & 4 deletions .idea/misc.xml

This file was deleted.

8 changes: 0 additions & 8 deletions .idea/modules.xml

This file was deleted.

11 changes: 0 additions & 11 deletions .idea/pofAutoBot.iml

This file was deleted.

6 changes: 0 additions & 6 deletions .idea/vcs.xml

This file was deleted.

637 changes: 0 additions & 637 deletions .idea/workspace.xml

This file was deleted.

224 changes: 157 additions & 67 deletions main/src/scraper.py → main/src/cl_alert.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from email.MIMEText import MIMEText
import mechanize
import cookielib
import ssl


#uncomment these 2 lines of code if you get the below error. Some unicode encoding stuff
Expand All @@ -17,12 +18,14 @@

stubFilename='carIdHashTable.json'
queryStringStubForTucson='http://tucson.craigslist.org/search/cto?'
queryStringForViewMatches='http://www.pof.com/viewmatches.aspx?agelow=23&agehigh=99&miles=10&contacted=2&cmdSearch=Refine+Matches'
queryStringForViewMatches='http://www.pof.com/viewmatches.aspx?agelow=25&agehigh=35&miles=10&contacted=2&cmdSearch=Refine+Matches'
queryStringForBasicSearchPage='https://www.pof.com/basicsearch.aspx'
firstQueryString='http://www.pof.com/'
numberOfGoogleResults=1000
stubMessage='Hey, nice profile. Must say you have a very nice smile. Are you from Tucson originally?'
stubMessage='Hey, nice profile. Love your smile. Are you from Tucson originally?'
startValue=1
stubUrlForPof='http://www.pof.com/'
stubUrlForBasicSearchPage='http://www.pof.com/'
stubUrlForTucsonCLInnerpages='http://tucson.craigslist.org/'
stubUrlForPhxCLInnerpages='http://phoenix.craigslist.org/'
username=""
Expand All @@ -38,7 +41,7 @@
# Set this to True when deploying to run on chung.cs.arizona.edu
isRunningOnServer=False;
firstTimeRun=False;

useBasicSearchPage=True

if(firstTimeRun):
bodyOfEmail="Hi, \n Here is a list of all the cars found today in Craigslist. This is the very first email of craigslist scraping for used cars. Tomorrow onwards you will be shown only new hits that were not sent today. These are the parameters used for this query:\n\n"
Expand Down Expand Up @@ -177,6 +180,142 @@ def readFromJsonToHashtable(filename):
carIdHashTable = {}
return htMyTable

def send_from_basic_search_page(br,queryStringForViewMatches):
    """Open a search-results page and submit the quick-message form on each profile it links to.

    br -- an already-configured browser object exposing open(), select_form(),
          form and submit() (the caller passes the browser it built).
    queryStringForViewMatches -- URL of the results page to open.

    Profile links are anchors that carry a class attribute and whose href
    contains "profile_id". The function prints progress as it goes and finally
    calls sys.exit(1). If the results page cannot be opened, the error is
    printed and the function returns without doing anything else.
    """
    import traceback

    # Profile links to skip; every link that gets a message is added here so
    # a profile listed twice on the page is only messaged once.
    already_sent_today = {'viewprofile.aspx?profile_id=82509149': 1}

    try:
        url = br.open(queryStringForViewMatches)
    except urllib2.HTTPError as e:
        print('HTTPError = ' + str(e.code))
        return
    except urllib2.URLError as e:
        print('URLError = ' + str(e.reason))
        return
    except httplib.HTTPException:
        print('HTTPException')
        return
    except Exception:
        print('generic exception: ' + traceback.format_exc())
        return

    # Only reached when the page opened; previously a failed open fell through
    # to the parsing below with `content` unbound.
    content = url.read()

    print("succesfully logged into pof")
    # parse the content into a format that soup understands
    soup = bs4.BeautifulSoup(content, "lxml")

    counter = 0
    # for each of the hyperlinks in the page
    for link in soup.find_all('a'):
        # anchors without a class attribute are ignored
        if link.get('class') is None:
            continue
        linkToNextPage = link.get('href')
        if linkToNextPage is None:
            continue
        # check if this hyperlink has a profile id
        if "profile_id" not in linkToNextPage:
            continue
        # profile_id=74824023 is my own id
        if "profile_id=74824023" in linkToNextPage:
            continue
        # Skip before fetching the profile page, so no request is spent on a
        # profile that has already been handled.
        if linkToNextPage in already_sent_today:
            continue

        profilePageUrl = stubUrlForBasicSearchPage + linkToNextPage

        try:
            # open the person's profile page
            br.open(profilePageUrl)

            # Select the first form (the first form is the quick message form)
            br.select_form(nr=0)
            br.form['message'] = stubMessage

            # submit the text
            br.submit()
            already_sent_today[linkToNextPage] = 1

            counter = counter + 1
            print("sent message to " + profilePageUrl)
        except Exception:
            # A failure on one profile must not stop the remaining ones.
            print('generic exception: ' + traceback.format_exc())

    print("done sending messages to " + str(counter) + " people")
    # Terminates the process; exit status 1 is kept from the original behavior.
    sys.exit(1)


def send_from_view_matches_page(br,queryStringForViewMatches):
    """Open the view-matches page and fill in the quick-message form on each matched profile.

    br -- an already-configured browser object exposing open(), select_form()
          and form (the caller passes the browser it built).
    queryStringForViewMatches -- URL of the matches page to open.

    Matched profiles are anchors whose class attribute contains "mi" and that
    have an href. The form is filled in but NOT submitted: the submit call is
    disabled in this function, so the counter reflects profiles processed.
    The function prints progress and finally calls sys.exit(1). If the matches
    page cannot be opened, the error is printed and the function returns.
    """
    import traceback

    try:
        url = br.open(queryStringForViewMatches)
    except urllib2.HTTPError as e:
        print('HTTPError = ' + str(e.code))
        return
    except urllib2.URLError as e:
        print('URLError = ' + str(e.reason))
        return
    except httplib.HTTPException:
        print('HTTPException')
        return
    except Exception:
        print('generic exception: ' + traceback.format_exc())
        return

    # Only reached when the page opened; previously a failed open fell through
    # to the parsing below with `content` unbound.
    content = url.read()

    print("succesfully logged into pof")
    # parse the content into a format that soup understands
    soup = bs4.BeautifulSoup(content, "lxml")

    counter = 0
    # for each of the hyperlinks in the page
    for link in soup.find_all('a'):
        classResult = link.get('class')
        if classResult is None:
            continue
        if "mi" not in classResult:
            continue
        linkToNextPage = link.get('href')
        if linkToNextPage is None:
            continue

        print("\n")
        profilePageUrl = stubUrlForPof + linkToNextPage

        try:
            # open the person's profile page
            br.open(profilePageUrl)

            # Select the first form (the first form is the quick message form)
            br.select_form(nr=0)
            br.form['message'] = stubMessage

            # Submission is left disabled here, exactly as in the original:
            # br.submit()
            counter = counter + 1
            print("sent message to " + profilePageUrl)
        except urllib2.HTTPError as e:
            print('HTTPError = ' + str(e.code))
        except urllib2.URLError as e:
            print('URLError = ' + str(e.reason))
        except httplib.HTTPException:
            print('HTTPException')
        except Exception:
            # A failure on one profile must not stop the remaining ones.
            print('generic exception: ' + traceback.format_exc())

    print("done sending messages to " + str(counter) + " people")
    # Terminates the process; exit status 1 is kept from the original behavior.
    sys.exit(1)

def writeToFileAsJson(myhashTable, filename):
# save to file:
Expand Down Expand Up @@ -204,6 +343,15 @@ def parseGResults(myQS):

br.addheaders = [('User-agent', 'Chrome')]

try:
_create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
# Legacy Python that doesn't verify HTTPS certificates by default
pass
else:
# Handle target environment that doesn't support HTTPS verification
ssl._create_default_https_context = _create_unverified_https_context

# The site we will navigate into, handling it's session
br.open(myQS)

Expand All @@ -224,76 +372,18 @@ def parseGResults(myQS):
try:
# Note: queryStringForViewMatches already includes the "haven't contacted before" filter.
# You don't want to spam someone you have already contacted and then get blocked.
url=br.open(queryStringForViewMatches)
#url = urllib2.urlopen(queryStringToSearch)
except urllib2.HTTPError, e:
print('HTTPError = ' + str(e.code))
except urllib2.URLError, e:
print('URLError = ' + str(e.reason))
except httplib.HTTPException, e:
print('HTTPException')
except Exception:
if(useBasicSearchPage):
send_from_basic_search_page(br,queryStringForBasicSearchPage)
else:
send_from_view_matches_page(br,queryStringForViewMatches)
except:
import traceback
print('generic exception: ' + traceback.format_exc())
else:
content = url.read()


print("succesfully logged into pof")
# parse the content into a format that soup understands
soup = bs4.BeautifulSoup(content, "lxml")
# for each of the hyperlinks in the page
for link in soup.find_all('a'):
#print(link)
classResult = link.get('class')
if (classResult != None):
if ("mi" in classResult):
# if the class exists, get the link, if its not null
linkToNextPage = link.get('href')
if (linkToNextPage != None):
print("\n")
profilePageUrl = stubUrlForPof + linkToNextPage
#print(profilePageUrl)
# once you get the link to the person'as profile, open and go into that page.


try:
br.open(profilePageUrl)
#for f in br.forms():
#print f

# Select the first form (the first form is the quick message form)
br.select_form(nr=0)

# User credentials
br.form['message'] = stubMessage


# submit the text
br.submit()
print("sent message to "+profilePageUrl)

except urllib2.HTTPError, e:
print('HTTPError = ' + str(e.code))
except urllib2.URLError, e:
print('URLError = ' + str(e.reason))
except httplib.HTTPException, e:
print('HTTPException')
except Exception:
import traceback
print('generic exception: ' + traceback.format_exc())
#else:
#profilePageDetails = profilePage.read()

sys.exit(1)



except:
#print('generic exception: ')
import traceback
print('generic exception: ' + traceback.format_exc())
#+sys.exc_info()[0])




Expand Down
14 changes: 14 additions & 0 deletions main/src/okcscraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import os

import requests

# Page fetched by this script.
url="https://www.okcupid.com/login"

# Credentials are read from the environment so they never land in version
# control. Set OKC_USERNAME and OKC_PASSWORD before running; they default to
# empty strings when unset.
# NOTE: payload is built but not sent anywhere by this script yet.
payload={
"username":os.environ.get("OKC_USERNAME",""),
"password":os.environ.get("OKC_PASSWORD","")
}

# Issue the request through the session so cookies set by the response are
# kept on it (the session was previously created but never used).
session=requests.session()
response=session.get(url).content
print(response)


Loading