import requests
from bs4 import BeautifulSoup
class Spider:
    """Spider class: method overview below."""
    # 1. __init__     = initialize the request; a target URL can be passed in
    # 2. print_html   = print the HTML source of the requested page
    # 3. webcollector = save the page source to a local file
    # 4. find         = find specific page content and print it
    def __init__(self, url_input='https://boards.4channel.org/w/'):
        """1. Send a GET request for the page."""
        print('-- Spider started --')
        print('....')
        print('Requesting page =', url_input)
        self.url = url_input
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.66 Safari/537.36 Edg/103.0.1264.44'}
        self.response = requests.get(url=self.url, headers=self.headers)
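        # Note (not in the original): no timeout or status check is applied here;
        # a more defensive version could pass timeout=... to requests.get() and
        # call self.response.raise_for_status() to fail fast on HTTP errors.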
    def print_html(self):
        """2. Print the page's HTML source."""
        print(self.response.text)
        print('-' * 50)
        print(self.url, 'HTML RESULT')
    def webcollector(self):
        """3. Save the page source to a local file."""
        print('Collecting page =', self.url)
        # Derive the file name from the second dot-separated part of the URL,
        # e.g. 'https://www.taobao.com/' -> 'taobao_scrape_result.html'.
        fileName = self.url.split('.')[1] + '_scrape_result' + '.html'
        with open(fileName, 'w', encoding='utf-8') as fp:
            fp.write(self.response.text)
        print('-' * 50)
        print('Page collected successfully! See the current directory.')
    def find(self, find_input_keyword='title'):
        """4. Find specific page content and print it."""
        # find_input_keyword values:
        # 'title' = find the contents of <title></title>
        # 'a'     = find the contents of <a></a> tags
        # 'img'   = find <img> tags
        # 'p'     = find the contents of <p></p> tags
        soup = BeautifulSoup(self.response.content, 'lxml')  # requires the lxml package
        self.find_all = soup.find_all(find_input_keyword)
        for element in self.find_all:
            print(element)
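
# Illustrative sketch (not part of the original class): for tags such as 'a' or
# 'img', the attribute values (href, src) are usually more useful than the raw
# markup printed by find(). The hypothetical helper below shows one way to pull
# them out of a Spider instance.
def print_tag_attrs(spider, tag='a', attr='href'):
    """Print the given attribute of every matching tag on the fetched page (sketch)."""
    soup = BeautifulSoup(spider.response.content, 'lxml')
    for element in soup.find_all(tag):
        value = element.get(attr)
        if value:
            print(value)
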
# 1. __init__: initialize the request; a custom URL can be passed in
taobao = Spider(url_input='https://www.taobao.com/')
# 2. print_html: print the HTML source of the requested page
taobao.print_html()
# 3. webcollector: save the page source to a local file
taobao.webcollector()
# 4. find: find specific page content and print it
taobao.find(find_input_keyword='title')
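
# Further usage (illustrative): other tags listed in find() can be queried the
# same way, and the sketch helper above could extract attribute values, e.g.:
# taobao.find(find_input_keyword='a')
# print_tag_attrs(taobao, tag='img', attr='src')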