Allen 2020-06-13 16:26:59 5279 0 0 0 0

Python爬虫用户代理池

# 用户代理池
# 如果经常用一个浏览器伪装爬，很容易被发现，这时候用多个代理（多个浏览器）访问
# 用户代理池：用多个浏览器标识构成一个集合，相当于一个池子，随机使用一个
# 让网站以为很多人访问，网站更难识别
import urllib.request #爬网页先导入这个
import random # 需要导入随机模块
# Pool of browser User-Agent strings; UA() picks one of these at random.
# Every entry should match what a real browser sends: the WebKit token is
# spelled "KHTML" (the first entry previously read "kHTML", which no browser
# emits).
uapools = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0",
]
def UA():
    """Install a global urllib opener that sends a randomly chosen User-Agent.

    One entry is drawn from the module-level ``uapools`` list and becomes the
    only default header of a new opener, which is then installed so that
    subsequent ``urllib.request.urlopen`` calls use it. The chosen value is
    also printed.
    """
    chosen_agent = random.choice(uapools)
    agent_opener = urllib.request.build_opener()
    # Replace (not extend) the default header list with the single UA header.
    agent_opener.addheaders = [("User-Agent", chosen_agent)]
    urllib.request.install_opener(agent_opener)
    print("当前使用UA:" + str(chosen_agent))
# Fetch the home page ten times, installing a freshly chosen User-Agent
# before every request, and print the length of each decoded page.
url = "https://www.qiushibaike.com/"
for _ in range(10):
    UA()
    # Use the response as a context manager so the connection is released
    # right after reading instead of lingering until garbage collection.
    with urllib.request.urlopen(url) as resp:
        # "ignore" drops bytes that are not valid UTF-8 rather than raising.
        data = resp.read().decode("utf-8", "ignore")
    print(len(data))

# Exercise: switch to a new User-Agent only once every 3 requests.
for i in range(10):
    # i is 0, 3, 6, 9 here, so the User-Agent is rotated before the 1st, 4th,
    # 7th and 10th request. (The original line ended with a full-width colon,
    # which is a SyntaxError.)
    if i % 3 == 0:
        UA()
    # Close the response deterministically after reading it.
    with urllib.request.urlopen(url) as resp:
        data = resp.read().decode("utf-8", "ignore")

+++
# 批量爬取糗事百科段子数据
# 目标站点： http://www.qiushibaike.com/
# 目标数据： 热门段子
# 要求: 实现自动翻页
# 做爬虫前要先分析
    #首先打开网页源数据，定位要爬的数据，分析上下文，找到标志性标签
    # 找到<div class="content">...</div>
    #翻页功能：观察翻页时网址的变化：https://www.qiushibaike.com/text/page/1/

import urllib.request #爬网页先导入这个
import random # 需要导入随机模块
import re
# Pool of browser User-Agent strings; UA() picks one of these at random.
# Every entry should match what a real browser sends: the WebKit token is
# spelled "KHTML" (the first entry previously read "kHTML", which no browser
# emits).
uapools = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0",
]
def UA():
    """Install a global urllib opener that sends a randomly chosen User-Agent.

    One entry is drawn from the module-level ``uapools`` list and becomes the
    only default header of a new opener, which is then installed so that
    subsequent ``urllib.request.urlopen`` calls use it. This variant is
    silent: it does not print the chosen value.
    """
    chosen_agent = random.choice(uapools)
    agent_opener = urllib.request.build_opener()
    # Replace (not extend) the default header list with the single UA header.
    agent_opener.addheaders = [("User-Agent", chosen_agent)]
    urllib.request.install_opener(agent_opener)
# Crawl text pages 1..35, installing a new User-Agent every 3 pages, and
# print every match found on each page.
# Each match is a 2-tuple: the text between <div class="content"> and the
# first </span>, and the text between the first and second </span>.
pat = '<div class="content">(.*?)</span>(.*?)</span>.*?</div>'
# Compile once: the pattern does not change between pages. re.S lets "."
# span newlines, since one entry covers several lines of HTML.
content_re = re.compile(pat, re.S)
for i in range(0, 35):
    if i % 3 == 0:
        UA()
    # Pages are numbered from 1, the loop index from 0.
    thisurl = "https://www.qiushibaike.com/text/page/" + str(i + 1) + "/"
    try:
        # Close the response deterministically after reading it.
        with urllib.request.urlopen(thisurl) as resp:
            data = resp.read().decode("utf-8", "ignore")
        rst = content_re.findall(data)
        for item in rst:
            print(item)
            print("------")
    except Exception as err:
        # Best effort: one failing page must not abort the whole crawl, but
        # report it instead of swallowing the error silently.
        print(thisurl, err)
        
# Save the crawled content to a file.
# (The imports, uapools and UA() are the same as in the script above.)
pK = 0  # running number of the entries written so far, across all pages
pat = '<div class="content">(.*?)</span>(.*?)</span>.*?</div>'
# Compile once: the pattern does not change between pages. re.S lets "."
# span newlines, since one entry covers several lines of HTML.
content_re = re.compile(pat, re.S)
# - "with" guarantees the file is flushed and closed (the original wrote
#   `ff1.close` without parentheses, which never called close()).
# - newline="" disables newline translation so the explicit "\r\n" below is
#   written as-is; in default text mode Windows would turn it into "\r\r\n".
# - encoding="utf-8" avoids UnicodeEncodeError on scraped text that a legacy
#   locale code page cannot represent.
with open("e:/qiushi.txt", "w", encoding="utf-8", newline="") as ff1:
    for i in range(0, 35):
        if i % 3 == 0:
            UA()
        # Pages are numbered from 1, the loop index from 0.
        thisurl = "https://www.qiushibaike.com/text/page/" + str(i + 1) + "/"
        try:
            # Close the response deterministically after reading it.
            with urllib.request.urlopen(thisurl) as resp:
                data = resp.read().decode("utf-8", "ignore")
            rst = content_re.findall(data)
            for j, item in enumerate(rst):
                pK = pK + 1
                # Progress indicator: page index and entry index on that page.
                print(str(i), str(j))
                ff1.write(str(pK) + "\r\n")
                ff1.write("-------\r\n")
                # item is a tuple of the two captured groups; it is written
                # in its str() form, as before.
                ff1.write(str(item) + "\r\n")
        except Exception as err:
            # Best effort: report the failure and continue with the next page.
            print(thisurl, err)

end;

【版權聲明】
本文爲原創，遵循CC 4.0 BY-SA版權協議！轉載時請附上原文鏈接及本聲明。
原文鏈接：https://tdlib.com/am.php?t=b5XVw9srtVju

Tag:

還沒有評論，快來搶沙發吧！ ↓

我也要發一個 · 返回首頁 · 返回[Python爬虫] · 前一個 · 下一個