
python web scraping

changwu edited this page Mar 28, 2016 · 5 revisions

1. Reading page content

GET method

import urllib2

url = "http://www.baidu.com"
response = urllib2.urlopen(url)
print response.read()

POST method

import urllib
import urllib2

url = "http://abcde.com"
form = {'name':'abc', 'password':'1234'}
form_data = urllib.urlencode(form)
request = urllib2.Request(url, form_data)
response = urllib2.urlopen(request)
print response.read()
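In Python 3, urllib2 was split into urllib.request and urllib.parse, and form data must be bytes. A rough sketch of the same POST example (the URL is the same placeholder as above; the urlopen call is commented out since it would hit the network):

```python
import urllib.parse
import urllib.request

url = "http://abcde.com"  # placeholder URL from the example above
form = {'name': 'abc', 'password': '1234'}

# urlencode returns a str; urlopen needs bytes in Python 3
form_data = urllib.parse.urlencode(form).encode('utf-8')

# Passing data= makes this a POST request
request = urllib.request.Request(url, data=form_data)
# response = urllib.request.urlopen(request)
# print(response.read())
```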

2. Using a proxy

urllib2's ProxyHandler

import urllib2

proxy = urllib2.ProxyHandler({'http': '127.0.0.1:8087'})
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)
response = urllib2.urlopen('http://www.baidu.com')
print response.read()
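The same ProxyHandler pattern lives in urllib.request in Python 3; a minimal sketch with the same placeholder proxy address (the final urlopen is commented out since it needs a live proxy):

```python
import urllib.request

# Route HTTP traffic through a local proxy (placeholder address from above)
proxy = urllib.request.ProxyHandler({'http': '127.0.0.1:8087'})
opener = urllib.request.build_opener(proxy)
urllib.request.install_opener(opener)
# response = urllib.request.urlopen('http://www.baidu.com')
# print(response.read())
```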

3. Handling cookies

cookielib, CookieJar()

import urllib2, cookielib

cookie_support= urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener = urllib2.build_opener(cookie_support)
urllib2.install_opener(opener)
content = urllib2.urlopen('http://XXXX').read()
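In Python 3, cookielib was renamed http.cookiejar; a sketch of the same setup (the urlopen call is commented out because the URL above is a placeholder):

```python
import http.cookiejar
import urllib.request

# Cookies received from servers accumulate in this jar and are
# resent automatically on later requests through this opener
jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))
urllib.request.install_opener(opener)
# content = urllib.request.urlopen('http://XXXX').read()
```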

4. Impersonating a browser

When urllib2 hits HTTP Error 403: Forbidden, set request headers such as:

  • User-Agent
  • Content-Type

import urllib2

headers = {
    'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
}
request = urllib2.Request(
    url = 'http://my.oschina.net/jhao104/blog?catalog=3463517',
    headers = headers
)
print urllib2.urlopen(request).read()
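A Python 3 sketch of the same User-Agent trick, using urllib.request (same URL and header as above; the urlopen call is commented out to avoid a live request):

```python
import urllib.request

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
}
# The server sees a Firefox browser rather than the default Python-urllib agent
request = urllib.request.Request(
    url='http://my.oschina.net/jhao104/blog?catalog=3463517',
    headers=headers
)
# print(urllib.request.urlopen(request).read())
```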

5. Parsing

  • regex
  • lxml, BeautifulSoup
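As a small illustration on a made-up HTML snippet, links can be pulled out with a regex (quick but fragile on real-world HTML) or with the standard library's html.parser; BeautifulSoup and lxml wrap the same idea in a friendlier API:

```python
import re
from html.parser import HTMLParser

html = '<a href="/a">A</a> <a href="/b">B</a>'  # made-up sample page

# Quick-and-dirty regex extraction
links_re = re.findall(r'href="([^"]+)"', html)

# A real parser: collect href attributes from <a> tags
class LinkCollector(HTMLParser):
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self.links.extend(v for k, v in attrs if k == 'href')

collector = LinkCollector()
collector.feed(html)
print(collector.links)  # ['/a', '/b'] from both approaches
```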

6. Captchas

7. gzip

Accept-encoding: gzip

import urllib2, httplib
request = urllib2.Request('http://xxxx.com')
request.add_header('Accept-encoding', 'gzip')
opener = urllib2.build_opener()
f = opener.open(request)

Decompressing

import StringIO
import gzip

compresseddata = f.read() 
compressedstream = StringIO.StringIO(compresseddata)
gzipper = gzip.GzipFile(fileobj=compressedstream) 
print gzipper.read()
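In Python 3, response bodies are bytes, so io.BytesIO replaces StringIO. A self-contained sketch that round-trips some sample data in place of a real response body:

```python
import gzip
import io

# Simulate a gzip-compressed response body (bytes in Python 3)
body = gzip.compress(b'hello, gzip page')

# Decompress, mirroring the StringIO/GzipFile recipe above
stream = io.BytesIO(body)
with gzip.GzipFile(fileobj=stream) as gzipper:
    text = gzipper.read()

print(text)  # b'hello, gzip page'
```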

8. Parallelization

from threading import Thread
from Queue import Queue
from time import sleep

q = Queue()
NUM = 2
JOBS = 10


def do_something_using(arguments):
    print arguments


def working():
    while True:
        arguments = q.get()
        do_something_using(arguments)
        sleep(1)
        q.task_done()


for i in range(NUM):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()


for i in range(JOBS):
    q.put(i)

q.join()
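A Python 3 version of the same worker-pool pattern: Queue became queue, and setDaemon gave way to the daemon keyword. Doubling each job number here is just stand-in work so the result is checkable:

```python
import queue
import threading

q = queue.Queue()
results = []
lock = threading.Lock()
NUM = 2    # worker threads
JOBS = 10  # jobs to process


def do_something_using(arguments):
    with lock:
        results.append(arguments * 2)  # stand-in for the real work


def working():
    while True:
        arguments = q.get()
        do_something_using(arguments)
        q.task_done()


for _ in range(NUM):
    t = threading.Thread(target=working, daemon=True)
    t.start()

for i in range(JOBS):
    q.put(i)

q.join()  # block until every job has been marked done
print(sorted(results))  # [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
```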
