-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathemail_extract.py
More file actions
36 lines (34 loc) · 888 Bytes
/
email_extract.py
File metadata and controls
36 lines (34 loc) · 888 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import urllib
import re
import urlparse
# Seed page the crawl starts from (sample target).
url = "http://www.mapsofindia.com/hotels-india/westbengal/kolkata.html"
# Links that have already been passed to extract_email.
visited = list()
# Compiled once at import time; matches text shaped like local@domain.tld.
_EMAIL_RE = re.compile(r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}')


def extract_email(url):
    """Fetch *url* and append each distinct e-mail address found to emails.txt.

    The page body is scanned with a simple regular expression. Addresses are
    written one per line, in first-seen order, to ``emails.txt`` in the
    current working directory. The file is opened in append mode, so output
    from successive calls accumulates (the same address can therefore appear
    again when it occurs on another page).

    Args:
        url: Address of the page to download.

    Returns:
        None. Results go to ``emails.txt`` and a progress line is printed.

    Raises:
        Nothing is caught here: whatever ``urllib.urlopen`` or ``open``
        raise propagates to the caller.
    """
    page = urllib.urlopen(url)
    try:
        page_data = page.read()
    finally:
        # Release the connection even when the read fails.
        page.close()

    # De-duplicate while keeping first-seen order; the set makes each
    # membership test O(1) instead of rescanning the list for every match.
    seen = set()
    unique_emails = []
    for address in _EMAIL_RE.findall(page_data):
        if address not in seen:
            seen.add(address)
            unique_emails.append(address)

    # `with` closes the handle on every path, so data is flushed promptly and
    # repeated calls do not accumulate open file descriptors.
    with open('emails.txt', 'a') as out:
        for address in unique_emails:
            out.write(address)
            out.write("\n")
    print("done url ", url)
# Harvest the start page first, then every page it links to (one level deep).
extract_email(url)
# Record the start page so a self-link on it is not harvested a second time.
visited.append(url)

link_re = re.compile(r'href="(.*?)"')

# extract_email() does not hand back the page body, so the start page is
# downloaded again here to pull out its links.
dt = urllib.urlopen(url)
try:
    req = dt.read()
finally:
    dt.close()
links = link_re.findall(req)

for link in links:
    # Resolve relative hrefs against the start page and drop any "#fragment",
    # which never changes the document that is fetched.
    link_new = urlparse.urldefrag(urlparse.urljoin(url, link))[0]

    # Crude substring filters: skip anything that looks like a script
    # pseudo-link or a stylesheet.
    if 'javascript' in link_new or 'css' in link_new:
        continue
    # mailto:, tel: and similar schemes are not fetchable pages; passing them
    # to urlopen raises and would abort the whole crawl.
    if urlparse.urlparse(link_new).scheme not in ('http', 'https'):
        continue
    if link_new in visited:
        continue

    # Record before fetching so a link that fails is not retried.
    visited.append(link_new)
    try:
        extract_email(link_new)
    except IOError as err:
        # One unreachable page should not stop the remaining links.
        print("failed url ", link_new, err)