«

常用的 Python 爬虫采集代码(GET 请求)

瑞瑞瑞 发布于 阅读:375 python


这是一段常用的、循环采集某网站内容的 Python 代码:

import requests,json,time

def down(id, timeout=30):
    r"""Download one page and save its text to a local file.

    Requests ``https://www.test.com/mulu/<id>`` with a Baiduspider
    User-Agent header. When the server answers with status 200, the decoded
    body is written as UTF-8 to ``D:\test\100000\<id>.txt``; any other status
    is reported on stdout and nothing is written. The target directory is not
    created here, so it must already exist.

    Args:
        id: Page number; appended to the URL and used as the file stem.
            (The name shadows the builtin but is kept for existing callers.)
        timeout: Seconds handed to ``requests.get`` so a stalled connection
            cannot block the crawl forever.

    Returns:
        None. Network failures are printed and swallowed so that a calling
        loop can carry on with the next page.
    """
    url = "https://www.test.com/mulu/" + str(id)
    print(url)

    # Alternative User-Agent strings kept for reference:
    #   Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36
    #   Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/600.2.5 (KHTML, like Gecko) Version/8.0.2 Safari/600.2.5 (Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot)
    headers = {"User-Agent": "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"}

    try:
        r = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors, timeouts, etc.: report and skip this page
        # instead of aborting the caller.
        print(f"Failed to retrieve the webpage: {exc}")
        return

    # Decode the body with the encoding detected from its content rather
    # than the one taken from the response headers.
    r.encoding = r.apparent_encoding

    filename = 'D:\\test\\100000\\' + str(id) + '.txt'
    print(filename)

    if r.status_code != 200:
        print(f"Failed to retrieve the webpage: Status code {r.status_code}")
        return

    with open(filename, 'w', encoding='utf-8') as f:
        f.write(r.text)

# Crawl page ids 50000 through 100000 inclusive, one request per second.
FIRST_PAGE_ID = 50000
END_PAGE_ID = 100001  # exclusive upper bound for range()

for page_id in range(FIRST_PAGE_ID, END_PAGE_ID):
    print(page_id)
    down(page_id)
    # Pause between requests.
    time.sleep(1)