-
Notifications
You must be signed in to change notification settings - Fork 1
python web scraping
changwu edited this page Mar 28, 2016
·
5 revisions
get 方法
# Minimal GET request: urlopen() is called with only a URL and no request body.
import urllib2
url = "http://www.baidu.com"
# The returned response is a file-like object; its body is read on the next line.
response = urllib2.urlopen(url)
print response.read()
post 方法
# POST request: the form dict is URL-encoded and sent as the request body.
import urllib
import urllib2
url = "http://abcde.com"
form = {'name':'abc', 'password':'1234'}
# urlencode() turns the dict into a "key=value&key=value" string.
form_data = urllib.urlencode(form)
# Supplying a data argument to Request is what makes urllib2 issue a POST.
request = urllib2.Request(url, form_data)
response = urllib2.urlopen(request)
print response.read()
urllib2 的 ProxyHandler
# Send HTTP requests through a proxy listening on 127.0.0.1:8087.
import urllib2
proxy = urllib2.ProxyHandler({'http': '127.0.0.1:8087'})
opener = urllib2.build_opener(proxy)
# install_opener() makes this opener the default one used by urllib2.urlopen().
urllib2.install_opener(opener)
response = urllib2.urlopen('http://www.baidu.com')
print response.read()
cookielib, CookieJar()
# Attach a CookieJar to the default opener so urlopen() calls go through
# a cookie-processing handler.
import urllib2, cookielib
cookie_support= urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener = urllib2.build_opener(cookie_support)
# After this call, plain urllib2.urlopen() uses the cookie-aware opener.
urllib2.install_opener(opener)
content = urllib2.urlopen('http://XXXX').read()
urllib2 出現 HTTP Error 403: Forbidden
- User-Agent
- Content-Type
# Build a request that carries a browser-like User-Agent header.
# NOTE(review): the surrounding notes present this as the workaround for
# "HTTP Error 403: Forbidden"; whether a given site accepts it is not shown here.
import urllib2
headers = {
'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
}
# The headers dict is passed to Request by keyword, alongside the target URL.
request = urllib2.Request(
url = 'http://my.oschina.net/jhao104/blog?catalog=3463517',
headers = headers
)
print urllib2.urlopen(request).read()
- regex
- lxml, BeautifulSoup
Accept-encoding: gzip
# Build a request that advertises gzip support to the server.
import urllib2, httplib
request = urllib2.Request('http://xxxx.com')
# The Accept-encoding header asks the server for a gzip-compressed body.
request.add_header('Accept-encoding', 'gzip')
# An opener with the default handlers; the request is opened with it next.
opener = urllib2.build_opener()
f = opener.open(request)
解壓縮
# Decompress the body of the response object `f` obtained above.
# NOTE(review): this assumes the server actually returned gzip data; checking
# the Content-Encoding response header first would confirm that.
import StringIO
import gzip
compresseddata = f.read()
# GzipFile needs a file-like object, so wrap the raw bytes in an in-memory buffer.
compressedstream = StringIO.StringIO(compresseddata)
gzipper = gzip.GzipFile(fileobj=compressedstream)
print gzipper.read()
from threading import Thread
from Queue import Queue
from time import sleep
# Shared job queue: the main script puts jobs on it and the workers take them off.
q = Queue()
# Number of worker threads to start.
NUM = 2
# Number of jobs to enqueue.
JOBS = 10
def do_somthing_using(arguments):
    """Process one job taken from the queue; this example just prints it."""
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(arguments)
def working():
    """Worker loop: repeatedly take a job from ``q`` and process it.

    Runs forever; it is meant to be the target of a daemon thread so that
    it does not keep the program alive once the main thread finishes.
    """
    while True:
        # Blocks until a job is available.
        arguments = q.get()
        try:
            do_somthing_using(arguments)
            sleep(1)
        finally:
            # Acknowledge the job even if processing raised, so a failed job
            # is not left unacknowledged (which would keep q.join() from
            # ever completing).
            q.task_done()
# Start NUM worker threads. They are daemon threads, so the endless worker
# loop does not prevent the program from exiting.
for i in range(NUM):
    t = Thread(target=working)
    # The daemon attribute replaces the deprecated setDaemon() call.
    t.daemon = True
    t.start()
# Enqueue JOBS jobs; each job is simply its index.
for i in range(JOBS):
    q.put(i)
# Block until every queued job has been acknowledged with task_done().
q.join()