Python爬虫用户代理池
# User-Agent pool for a web crawler.
#
# A crawler that always presents the same browser identity is easy for a site
# to detect.  Keeping a pool of User-Agent strings and picking one at random
# per request makes the traffic look like it comes from many different
# visitors, so the site has a harder time identifying the crawler.
#
# Target site: https://www.qiushibaike.com/  (hot jokes, auto-paging via
# /text/page/N/).  The joke body sits inside <div class="content">...</div>.
import random
import re
import urllib.request

# Pool of browser identification strings.
# NOTE(review): the first entry says "kHTML" where genuine Chrome UAs say
# "KHTML" -- kept byte-identical to the original; confirm it is intentional.
uapools = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (kHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0",
]

# Front page used by the two rotation demos.
URL = "https://www.qiushibaike.com/"

# Regex matching one joke body on a /text/page/N/ listing page.
JOKE_PAT = '<div class="content">(.*?)</span>(.*?)</span>.*?</div>'


def UA():
    """Install a global urllib opener whose User-Agent is drawn from uapools.

    Returns:
        str: the User-Agent string that was chosen and installed, so callers
        can log which identity the next request will use.
    """
    opener = urllib.request.build_opener()
    thisua = random.choice(uapools)
    opener.addheaders = [("User-Agent", thisua)]
    urllib.request.install_opener(opener)
    print("当前使用UA:" + str(thisua))
    return thisua


def demo_rotate_each_request(times=10):
    """Fetch the front page `times` times, switching the UA on every request."""
    for _ in range(times):
        UA()
        data = urllib.request.urlopen(URL).read().decode("utf-8", "ignore")
        print(len(data))


def demo_rotate_every_third(times=10):
    """Fetch the front page `times` times, switching the UA every 3rd request."""
    for i in range(times):
        if i % 3 == 0:
            UA()
        data = urllib.request.urlopen(URL).read().decode("utf-8", "ignore")
        # FIX: the original fetched but silently discarded `data`; report the
        # page size so the demo is observable, matching the first demo.
        print(len(data))


def crawl_jokes(pages=35):
    """Print joke bodies from the first `pages` listing pages (auto-paging).

    The UA is rotated every 3rd page.  Failures on a single page are logged
    and skipped so the crawl is best-effort.
    """
    joke_re = re.compile(JOKE_PAT, re.S)  # hoisted: compile once, not per page
    for i in range(pages):
        if i % 3 == 0:
            UA()
        thisurl = "https://www.qiushibaike.com/text/page/" + str(i + 1) + "/"
        try:
            data = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore")
            for item in joke_re.findall(data):
                print(item)
                print("------")
        except Exception as err:
            # FIX: the original `except: pass` silently hid every failure;
            # log it but keep the best-effort per-page behavior.
            print(err)


def crawl_jokes_to_file(path="e:/qiushi.txt", pages=35):
    """Crawl the first `pages` listing pages and write numbered jokes to `path`.

    FIX: the original ended with `ff1.close` (missing parentheses), so the
    file was never actually closed; the `with` block guarantees it now.
    """
    joke_re = re.compile(JOKE_PAT, re.S)
    count = 0  # running joke number across all pages (was `pK`)
    with open(path, "w") as out:
        for i in range(pages):
            if i % 3 == 0:
                UA()
            thisurl = "https://www.qiushibaike.com/text/page/" + str(i + 1) + "/"
            try:
                data = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore")
                for item in joke_re.findall(data):
                    count += 1
                    out.write(str(count) + "\r\n")  # \r\n: explicit file line break
                    out.write("-------\r\n")
                    out.write(str(item) + "\r\n")
            except Exception as err:
                print(err)


if __name__ == "__main__":
    # Preserve the original script's top-to-bottom run order, but only when
    # executed directly -- importing this module no longer hits the network.
    demo_rotate_each_request()
    demo_rotate_every_third()
    crawl_jokes()
    crawl_jokes_to_file()
end;
【版權聲明】
本文爲原創,遵循CC 4.0 BY-SA版權協議!轉載時請附上原文鏈接及本聲明。
原文鏈接:https://tdlib.com/am.php?t=b5XVw9srtVju Tag: Python 用户代理池 浏览器伪装