利用爬虫技术能做到哪些很酷很有趣很有用的事情？

发表于：2023年2月6日
分类：未分类

谢邀。

我用爬虫爬了我爱白菜网、超值分享汇、发现值得买、惠惠购物、今日聚超值、留住你、买手党、没得比、慢慢买、牛杂网、买个便宜货、什么值得买、天上掉馅饼、一分网、折800值得买、值值值等网站的折扣信息。

这些网站提供的都是一些及时的、性价比较高的商品信息，但很多时候（尤其是重度用户）要一个一个网站地去看，很容易就会错过一些很划算的商品。

于是用Python抓取了这些打折信息，并输出到网站上。

#-*- coding: utf-8 -*-
from Haohuola.Base import p
from bs4 import BeautifulSoup
import PostMessage,json
from se import getGoodsUrl
from qiniuUpload import getImageUrl
from Haohuola.Base import getHtml,getMallCountry,get_title_price
from Haohuola.Base import getTags,handle_content
from Haohuola.Category import getCategory

# Display name recorded as the author of every post published from this site.
AUTHOR = u"超值分享汇"

# Browser-like request headers sent with the listing-page request.
# NOTE(review): 'Accept-Encoding' advertises gzip/deflate/sdch, so getHtml()
# presumably decompresses the response before it is decoded below -- confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4,ja;q=0.2,fr;q=0.2,es;q=0.2,ru;q=0.2',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.czfxh.com',
}

# Download the front page and parse it.
html = getHtml('http://www.czfxh.com', headers).decode('utf-8')
soup = BeautifulSoup(html)

# Posts live inside the element with id="main"; keep only the first eight
# <div class="post"> entries and flip their order so the last of those eight
# is handled first (presumably so older deals are published before newer
# ones -- confirm).
soupid = soup.find(id="main")
items = soupid.find_all("div", {"class": "post"})[:8][::-1]

# Publish each scraped post.
#
# For every post: read the original purchase link and title, skip it unless
# both PostMessage checks return 1, derive the article fields, and hand them
# to PostMessage.postMessage().  The whole body is best-effort: any exception
# raised while handling one post is printed and the loop moves on to the next.
for item in items:
    try:
        # Original purchase link as found on the listing page.
        link = item.find('div', {'class': 'buy_url'}).a['href'].encode('utf-8')
        title = item.h2.get_text().encode('utf-8')

        # NOTE(review): a return value of 1 presumably means "link not seen
        # before" -- confirm against PostMessage.urlToSQL.
        if PostMessage.urlToSQL(link) != 1:
            continue

        # getGoodsUrl() returns an indexable result; element 0 is used as the
        # converted link and element 2 as the mobile value.
        tuple_url = getGoodsUrl(link)
        links = tuple_url[0]
        mobile = tuple_url[2]
        if not links:
            continue

        # NOTE(review): presumably 1 means the converted link is new as well;
        # any other result only records the original link and skips the post.
        if PostMessage.transToSQL(tuple_url) != 1:
            PostMessage.insertIntoOriginal(link)
            continue

        alltitle = get_title_price(title)
        price = alltitle['price']                  # extracted price
        article_title = alltitle['article_title']  # title
        publish_title = alltitle['publish_title']

        content = str(item.find('div', {'class': 'content'}).select('.conBox')[0])
        publish_content, img_url = handle_content(content)
        image_url = getImageUrl(img_url, publish_title)

        # Mall and region (domestic vs. overseas), looked up from the
        # converted link.
        MallCountry = getMallCountry(links)
        country = MallCountry["region"]  # domestic or overseas
        mall = MallCountry["mall"]       # which mall the product comes from

        Category = getCategory(publish_title)
        category = Category['code']
        tags = getTags(Category, mall)   # product tags

        res = PostMessage.postMessage(publish_title, publish_content, AUTHOR, links, mall, image_url, price, tags, country, article_title, category, mobile)

        p(u'---start---', publish_title, publish_content, AUTHOR, links, mall, image_url, price, tags, country, article_title, category, mobile, u'---end---')

        # Record both links only after a successful publish.  The equality
        # test against True is kept as-is because the return type of
        # postMessage() is not visible here.
        if res == True:
            PostMessage.insertIntoOriginal(link)
            PostMessage.insertIntoSQL(tuple_url)

    except Exception as e:
        # `as` binding and parenthesised print behave the same on
        # Python 2.6+ and are also valid Python 3 syntax.
        print(e)