Python 爬虫汇总
作者:jit 日期:2016-03-30
import cookielib
import gzip
import os
import re
import StringIO
import sys
import urllib
import urllib2
import zlib

import requests
#伪造headers 方式:
url = "http://am.22.cn"
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.8',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': 'ASP.NET_SessionId=v3s4qeaysqitplcsp2en142e; CNZZDATA906087=cnzz_eid%3D1527837511-1447985608-http%253A%252F%252Fwww.22.cn%252F%26ntime%3D1459259942',
'Host': 'am.22.cn',
'Referer':'http://am.22.cn/ykj/?t=0.21097470237873495&ddlSuf=0&keyword=wmLjx&chkorder=-1&chkday=-1&position=&position1=&position2=&MinPrice=0&MaxPrice=&selMinLen=1&selMaxLen=200&dealtype=2&keytype=0&issch=1&showtype=0',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 UBrowser/5.6.11466.7 Safari/537.36'
}
request = urllib2.Request(url, headers=headers)
resp = urllib2.urlopen(request).read()
data = StringIO.StringIO(resp)
gzipper = gzip.GzipFile(fileobj=data)
html = gzipper.read()
print html
# Approach 2: cookie handling.
# Cookies are stored in a jar attached to the opener, which submits them
# automatically on subsequent requests.
url = "http://www.xxxx.com/login"
post_data = {"module": "enterzone", "login_name": "", "password": ""}

# Build a cookie-aware opener and install it globally so that plain
# urllib2.urlopen() calls go through it as well.
cj = cookielib.CookieJar()
cookie_handler = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_handler)
urllib2.install_opener(opener)

# Visit the page once so that any cookies the server sets land in the jar.
request = urllib2.urlopen(url).read()

# URL-encode the form dictionary and POST it through the same opener.
postdata = urllib.urlencode(post_data)
resp = opener.open(url, postdata).read()
#验证码识别
评论: 0 | 引用: 0 | 查看次数: -
发表评论