Python爬虫实战:一则解析
原代码
来源
2024年你都用 Python 来做什么? – 愤怒的it男的回答 – 知乎
https://www.zhihu.com/question/640734882/answer/3378605504
须知
初学爬虫,阅读这篇文章的代码给我很大的启示,侵权必删.
代码
import csv
import requests
from lxml import etree
from prettytable import PrettyTable
def getData(baseUrl, data, headers):
    """Fetch the result table with a POST request and return its rows.

    Args:
        baseUrl: URL the POST request is sent to.
        data: Request body, passed unchanged to ``requests.post``.
        headers: HTTP headers, passed unchanged to ``requests.post``.

    Returns:
        A list with one 7-item list per usable table row: text nodes
        1, 3, 4, 5, 6, 7 and 9 of the row, the first and last of them
        stripped of surrounding whitespace.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.post(url=baseUrl, data=data, headers=headers)
    # Fail loudly on an HTTP error instead of parsing an error page as data.
    response.raise_for_status()
    html = etree.HTML(response.text)
    trs = html.xpath("//table[@id='tab']/tr")
    # Use a separate name so the ``data`` parameter is not overwritten.
    rows = []
    # The first <tr> (index 0) is skipped, as in the original ``index != 0``
    # check — presumably it is the header row.
    for tr in trs[1:]:
        text = tr.xpath("td//text()")
        # The highest index read below is 9; a shorter row cannot be a full
        # record, so skip it rather than raise IndexError.
        if len(text) < 10:
            continue
        rows.append([
            text[1].strip(),
            text[3],
            text[4],
            text[5],
            text[6],
            text[7],
            text[9].strip(),
        ])
    return rows
def printData(result):
    """Print the scraped rows to standard output as a text table.

    Args:
        result: Rows to display; each row supplies one value for each of
            the seven column headers set below.
    """
    table = PrettyTable()
    table.field_names = ["登记证号", "农药名称", "农药类别", "剂型", "总含量", "有效期至", "登记证持有人"]
    table.add_rows(result)
    print(table)
def saveData(result):
    """Write the rows to ``农药登记数据.csv`` in the current directory.

    The file is created or overwritten, encoded as UTF-8, and contains only
    the data rows; no header line is written.

    Args:
        result: Iterable of rows, each an iterable of field values.
    """
    # newline='' lets the csv module control line endings itself.
    with open('农药登记数据.csv', 'w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(result)
def main():
    """Run the scraper: fetch the rows, print them, then save them to CSV."""
    baseUrl = 'https://www.icama.cn/BasicdataSystem/pesticideRegistration/queryselect.do'
    headers = {
        # The body below is already URL-encoded, so declare it as form data.
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    }
    # All filter fields are left empty; pageNo=1 with pageSize=50000
    # presumably returns every record in a single page — confirm against
    # the site.
    data = "pageNo=1&pageSize=50000&djzh=&nymc=&cjmc=&sf=&nylb=&zhl=&jx=&zwmc=&fzdx=&syff=&dx=&yxcf=&yxcf_en=&yxcfhl=&yxcf2=&yxcf2_en=&yxcf2hl=&yxcf3=&yxcf3_en=&yxcf3hl=&yxqs_start=&yxqs_end=&yxjz_start=&yxjz_end=&accOrfuzzy=2"
    result = getData(baseUrl, data, headers)
    printData(result)
    saveData(result)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
代码解析
导入模块
import csv
import requests
from lxml import etree
from prettytable import PrettyTable
定义getData()函数
def getData(baseUrl, data, headers):
    response = requests.post(url=baseUrl, data=data, headers=headers)
    html = etree.HTML(response.text)
    trs = html.xpath("//table[@id='tab']/tr")
    data = []
函数接收三个参数:基础URL<baseUrl>,要发送的数据<data>,HTTP请求头<headers>;
用requests.post发送POST请求,获取响应<response>;
解析响应中的HTML内容,提取表格中的行<trs>;
    for index, tr in enumerate(trs):
        text = tr.xpath("td//text()")
        if index != 0:
            text = [text[1].strip(), text[3], text[4], text[5], text[6], text[7], text[9].strip()]
            data.append(text)
    return data
遍历表格中的行,提取每行单元格文本;对表头之外的行(index不为0)只保留所需的7个字段,并将结果添加到<data>列表中.
定义printData()函数
def printData(result):
    """Print the scraped rows to standard output as a text table.

    Args:
        result: Rows to display; each row supplies one value for each of
            the seven column headers set below.
    """
    table = PrettyTable()
    table.field_names = ["登记证号", "农药名称", "农药类别", "剂型", "总含量", "有效期至", "登记证持有人"]
    table.add_rows(result)
    print(table)
定义saveData()函数
def saveData(result):
    """Write the rows to ``农药登记数据.csv`` in the current directory.

    The file is created or overwritten, encoded as UTF-8, and contains only
    the data rows; no header line is written.

    Args:
        result: Iterable of rows, each an iterable of field values.
    """
    # newline='' lets the csv module control line endings itself.
    with open('农药登记数据.csv', 'w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(result)
使用<csv.writer()>写入数据到CSV文件
main()函数
调用函数的部分.