欢迎光临
我们一直在努力

爬虫实战 | 图书比价工具的实现

程序设计思路

前面已经把三个网站数据爬了下来,下面把这些数据存到一个字典数组中,然后按照价格进行排序,下面介绍一些细节,并附上完整代码。

首先,我们需要把之前的代码优化一下,以当当网为例,我们为其增加一个函数参数book_list = [] ,然后我们在最后一步把爬取到的数据存储到字典中,完整代码如下:

import requests
from lxml import html

def spider(ISBN, book_list=None):
    """Scrape dangdang.com search results for the given ISBN.

    Appends one dict per result ('title', 'price', 'link', 'store')
    to book_list and prints each field as it is scraped.
    """
    # Fix the mutable-default-argument pitfall: a shared default list
    # would silently accumulate results across separate calls.
    if book_list is None:
        book_list = []

    url = 'http://search.dangdang.com/?key={ISBN}&act=input'.format(ISBN=ISBN)

    # Fetch the raw HTML of the search results page.
    html_data = requests.get(url).text

    # Build an lxml element tree so we can query it with XPath.
    selector = html.fromstring(html_data)

    ul_list = selector.xpath('//div[@id="search_nature_rg"]/ul/li')
    for li in ul_list:
        # Title
        title = li.xpath('a/@title')
        print(title[0])
        # Link
        link = li.xpath('a/@href')
        print(link[0])
        # Price
        price = li.xpath('p[@class="price"]/span[@class="search_now_price"]/text()')
        # Fallback: for dangdang self-operated items the price sits inside
        # an extra ebook_buy div, so the first XPath comes back empty.
        if not price:
            price = li.xpath('div[@class="ebook_buy"]/p[@class="price e_price"]/span[@class="search_now_price"]/text()')
        price[0] = price[0].replace('¥', '')
        print(price[0])
        # Store / seller
        store = li.xpath('p[@class="search_shangjia"]/a/text()')
        # Self-operated items have no seller link. Use a LIST here: the
        # original assigned a bare string, so store[0] below stored only
        # the first character ('当') instead of the full seller name.
        if not store:
            store = ['当当自营']
        print(store[0])
        print("---------")

        book_list.append({
            'title': title[0],
            'price': price[0],
            "link": link[0],
            'store': store[0]
        })

一号店和京东同样原理进行操作(下面先给出一号店的代码,再给出京东的代码):

import requests
from lxml import html

def spider(IBSN, book_list=None):
    """Scrape yhd.com (1号店) search results for the given ISBN.

    Appends one dict per result ('title', 'price', 'link', 'store')
    to book_list and prints each field as it is scraped.
    Note: the parameter is spelled IBSN in the original API; kept for
    backward compatibility with keyword callers.
    """
    # Fix the mutable-default-argument pitfall: a shared default list
    # would silently accumulate results across separate calls.
    if book_list is None:
        book_list = []

    url = 'https://search.yhd.com/c0-0/k{IBSN}/'.format(IBSN=IBSN)
    # Fetch the raw HTML of the search results page.
    requ = requests.get(url).text
    # Build an lxml element tree so we can query it with XPath.
    selector = html.fromstring(requ)
    # Locate the list of product entries.
    ul_list = selector.xpath('//div[@id="itemSearchList"]/div')
    for li in ul_list:
        # NOTE(review): predicates like p["@proName clearfix"] are string
        # predicates, which XPath treats as always-true — they select ALL
        # <p> children, not just the ones with that class. It happens to
        # work against this page layout; confirm before tightening to
        # p[@class="proName clearfix"].
        # The title XPath yields a two-element list: [product title, shop name].
        title = li.xpath('div/p["@proName clearfix"]/a/@title')
        print(title[0])
        link = li.xpath('div/p["@proName clearfix"]/a/@href')
        # Links are protocol-relative; prefix a scheme so they are clickable.
        link[0] = 'http:' + link[0]
        print(link[0])
        price = li.xpath('div/p["@proPrice"]/em/@yhdprice')
        print(price[0])
        # Same XPath as title: index 1 is the shop name.
        store = li.xpath('div/p["@proName clearfix"]/a/@title')
        print(store[1])
        print('----------')

        book_list.append({
            'title': title[0],
            'price': price[0],
            "link": link[0],
            'store': store[1]
        })
import requests
from lxml import html

# Request headers copied from a real browser session so JD's search page
# serves the full result list instead of blocking the scraper.
# NOTE(review): the Cookie value embeds session-specific tokens that have
# likely expired — confirm whether it is still required/valid.
headers = {'authority': 'search.jd.com',
            'method': 'GET',
            'path': '/s_new.php?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=4&s=84&scrolling=y&log_id=1529828108.22071&tpl=3_M&show_items=7651927,7367120,7056868,7419252,6001239,5934182,4554969,3893501,7421462,6577495,26480543553,7345757,4483120,6176077,6932795,7336429,5963066,5283387,25722468892,7425622,4768461',
            'scheme': 'https',
            'referer': 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=3&s=58&click=0',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest',
            'Cookie':'qrsc=3; pinId=RAGa4xMoVrs; xtest=1210.cf6b6759; ipLocation=%u5E7F%u4E1C; _jrda=5; TrackID=1aUdbc9HHS2MdEzabuYEyED1iDJaLWwBAfGBfyIHJZCLWKfWaB_KHKIMX9Vj9_2wUakxuSLAO9AFtB2U0SsAD-mXIh5rIfuDiSHSNhZcsJvg; shshshfpa=17943c91-d534-104f-a035-6e1719740bb6-1525571955; shshshfpb=2f200f7c5265e4af999b95b20d90e6618559f7251020a80ea1aee61500; cn=0; 3AB9D23F7A4B3C9B=QFOFIDQSIC7TZDQ7U4RPNYNFQN7S26SFCQQGTC3YU5UZQJZUBNPEXMX7O3R7SIRBTTJ72AXC4S3IJ46ESBLTNHD37U; ipLoc-djd=19-1607-3638-3638.608841570; __jdu=930036140; user-key=31a7628c-a9b2-44b0-8147-f10a9e597d6f; areaId=19; __jdv=122270672|direct|-|none|-|1529893590075; PCSYCityID=25; mt_xid=V2_52007VwsQU1xaVVoaSClUA2YLEAdbWk5YSk9MQAA0BBZOVQ0ADwNLGlUAZwQXVQpaAlkvShhcDHsCFU5eXENaGkIZWg5nAyJQbVhiWR9BGlUNZwoWYl1dVF0%3D; __jdc=122270672; shshshfp=72ec41b59960ea9a26956307465948f6; rkv=V0700; __jda=122270672.930036140.-.1529979524.1529984840.85; __jdb=122270672.1.930036140|85.1529984840; shshshsID=f797fbad20f4e576e9c30d1c381ecbb1_1_1529984840145'
            }

def spider(ISBN, book_list=None):
    """Scrape jd.com search results for the given ISBN.

    Appends one dict per result ('title', 'price', 'link', 'store')
    to book_list and prints each field as it is scraped. Requires the
    module-level `headers` dict to avoid being blocked by JD.
    """
    # Fix the mutable-default-argument pitfall: a shared default list
    # would silently accumulate results across separate calls.
    if book_list is None:
        book_list = []

    url = 'https://search.jd.com/Search?keyword={ISBN}'.format(ISBN=ISBN)
    # Fetch the search page with browser-like headers.
    requ = requests.get(url, headers=headers)
    print(requ.encoding)
    # Force UTF-8 so Chinese titles decode correctly regardless of the
    # encoding requests guessed from the response headers.
    requ.encoding = 'utf-8'
    html_data = requ.text
    # Build an lxml element tree so we can query it with XPath.
    selector = html.fromstring(html_data)
    # Locate the list of product entries.
    ul_list = selector.xpath('//div[@id="J_goodsList"]/ul/li')
    print(len(ul_list))
    for li in ul_list:
        title = li.xpath('div/div[@class="p-name"]/a/@title')
        print(title[0])
        link = li.xpath('div/div[@class="p-name"]/a/@href')
        # Links are protocol-relative; prefix a scheme so they are clickable.
        link[0] = 'https:' + link[0]
        print(link[0])
        price = li.xpath('div/div[@class="p-price"]/strong/i/text()')
        print(price[0])
        store = li.xpath('div/div[@class="p-shopnum"]/a/@title')
        print(store[0])
        print('----------')

        book_list.append({
            'title': title[0],
            'price': price[0],
            "link": link[0],
            'store': store[0]
        })

下面,我们把这三个函数引入到新创建的python文件中,依次执行,再把所有数据都存到一个字典列表中,进行排序。备注一下字典列表的排序方式:

# Demo data: a list of dicts, to be sorted by different keys below.
lis = [
    {"name": "Taobao", "age": 100},
    {"name": "Runoob", "age": 7},
    {"name": "Google", "age": 100},
    {"name": "Wiki", "age": 200},
]

# Ascending sort by the 'age' value.
print("列表通过 age 升序排序: ")
by_age = sorted(lis, key=lambda entry: entry['age'])
print(by_age)

print("\r")

# Sort by 'age' first, breaking ties with 'name'.
print("列表通过 age 和 name 排序: ")
by_age_then_name = sorted(lis, key=lambda entry: (entry['age'], entry['name']))
print(by_age_then_name)

print("\r")

# Descending sort by the 'age' value.
print("列表通过 age 降序排序: ")
by_age_desc = sorted(lis, key=lambda entry: entry['age'], reverse=True)
print(by_age_desc)

最后直接贴出来我写的完整代码:

from spider_dangdang import spider as dangdang
from  spider_jd import  spider as jd
from spider_yhd import spider as yhd

def main(ISBN):
    """Book price-comparison tool.

    Runs all three store scrapers against the given ISBN, collects their
    results into one list, then prints the books ordered cheapest first.
    """
    book_list = []
    # Dangdang
    dangdang(ISBN, book_list)
    print('当当网数据爬取完成')
    # Yihaodian (1号店)
    yhd(ISBN, book_list)
    print('1号店数据爬取完成')
    # JD
    jd(ISBN, book_list)
    print('京东网数据爬取完成')
    # Prices were scraped as strings; convert to float so the comparison
    # is numeric rather than lexicographic.
    book_list.sort(key=lambda entry: float(entry["price"]))
    for entry in book_list:
        print(entry)
    print('--------------')

if __name__ == '__main__':
    # Script entry point: prompt the user for an ISBN and run the comparison.
    ISBN = input('请输入ISBN:')
    main(ISBN)

要注意一下,我们进行价格排序时,要把字符串转为浮点型,不然无法进行比较!

这样,一个简单的图书比价程序就完成了,后面我会想办法把数据存储到MySQL数据库中,方便之后的调用,最终通过一个简单的PythonWeb框架,为用户提供交互界面!

赞(1) 打赏
未经允许不得转载:散人研 » 爬虫实战 | 图书比价工具的实现
分享到: 更多 (0)

评论 抢沙发

5 + 1 =
  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址

觉得文章有用就打赏一下文章作者

支付宝扫一扫打赏

微信扫一扫打赏