urllib 和 urllib2 中设置 HTTP 超时
import socket

# Set the default timeout (in seconds) for every socket created from now on,
# so that urllib / urllib2 requests made afterwards give up instead of
# blocking forever on an unresponsive host.
socket.setdefaulttimeout(5)
最简洁的抓取语句:
# Shortest form: fetch the body of `url` in a single expression.
urllib.urlopen(url).read()

# Explicit form: keep the response object so it can be closed afterwards.
u = urllib.urlopen(url)
try:
    c = u.read()
finally:
    # close() must actually be called; a bare `u.close` only references the
    # method and leaves the connection open.
    u.close()
POST数据:
# Passing a url-encoded body as the second argument makes urlopen send the
# request as a POST instead of a GET.
post_data = urllib.urlencode({'gtalk': fromid, 'msg': content})
u = urllib.urlopen('http://url', post_data)
try:
    c = u.read()
finally:
    # close() must actually be called; a bare `u.close` only references the
    # method and leaves the connection open.
    u.close()
传递cookies
import cookielib
import urllib
import urllib2

cj = cookielib.CookieJar()
# Build the opener around the cookie jar so cookies set by the server are
# stored in `cj` and sent back on later requests made through this opener.
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# Additionally send a fixed Cookie header with every request from this opener.
opener.addheaders = [("Cookie", "dV9pZA**=Mg**; iCast2_1470_1097hO=0_2_20160; _iCast2_1470_1097hO=1")]
url_post = 'http://test.soften.cn'
# NOTE(review): `posts` is not defined in this snippet; presumably a dict of
# form fields supplied by the caller -- confirm before use.
response = opener.open(url_post, urllib.urlencode(posts))
try:
    content = response.read()
finally:
    response.close()
opener.close()
print(content)
匹配:
# Capture the text between `tags=` and the next `">` in the string `c`.
# Flags are combined with `|`: re.I ignores case, re.S lets `.` match
# newlines, re.M makes ^/$ match at line boundaries.
m = re.search(r'tags=(.*?)">', c, re.I | re.S | re.M)
if m:
    print(m.group(1))
替换:
# Replace every <style>...</style> and <script>...</script> element in `html`
# with a single space. re.S lets `.` span newlines inside the element and
# re.I makes the tag names case-insensitive.
pattern = re.compile(r'<style.*?</style>|<script.*?</script>', re.S | re.I)
html = pattern.sub(' ', html)
# Turn a dict of fields into a query string / form body.
# Dict entries are separated by commas (a `;` here is a syntax error).
m = {'name': 'somebody', 'gender': 'male'}
s = urllib.urlencode(m)
print(s)  # e.g. gender=male&name=somebody (pair order follows dict iteration order)
# Percent-encode a string for safe use inside a URL.
content = "zhongwen zifu"
quoted = urllib.quote(content)  # 'zhongwen%20zifu'
# urllib.unquote(quoted) performs the reverse conversion.