如何用 Python 爬取韓都衣舍雙11的打折力度
分析: 韓都衣舍旗艦店首頁網址為:https://handuyishe.tmall.com/
所有商品的網址為首頁上「全部分類」連結所指向的頁面(對應下文的參數 path=/category.htm)。
分析思路如下:在火狐瀏覽器上按下F12開啟開發者工具, 再按下F5重新整理頁面
可以觀察商品是通過JS觸發的, 網址如下:
https://handuyishe.tmall.com/i/asynSearch.htm
其中有效參數如下:
callback=jsonp140
mid=w-14593428692-0
path=/category.htm
scene=taobao_shop
pageNo=2
現在重點即獲取mid值, 其中pageNo為頁碼
如何獲取mid值
查看網頁原始程式碼, 搜索w-14593428692-0, 可以發現該值位於id為J_ShopAsynSearchURL的input元素的value屬性中(即其中的mid參數)
如何獲取頁碼:在商品頁的上面有頁碼數, 如下所示2/57, 其中57即為頁碼總數
點開任意一個商品連結, 按下F12, 觀察商品的價格, 可以發現價格也是通過JS觸發的, 其中觸發網址為
https://mdskip.taobao.com/core/initItemDetail.htm, 需要的主要參數為itemId, 該值即商品網址中的id參數
原始程式碼:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
import urllib
import re
from urllib.request import Request as URequest
from urllib.parse import urlencode
import ssl
def _parse_cookie_string(raw):
    """Turn a browser ``Cookie`` header string into a ``{name: value}`` dict.

    Fragments are separated by ``;``. Each fragment is split on its first
    ``=`` only, so a value that itself contains ``=`` is kept whole.
    Surrounding whitespace is stripped, and fragments that have no name or
    no ``=`` (including the empty string) are skipped instead of raising.
    """
    parsed = {}
    for fragment in raw.split(';'):
        key, sep, value = fragment.partition('=')
        key = key.strip()
        if sep and key:
            parsed[key] = value.strip()
    return parsed


class ClothesSpider(scrapy.Spider):
    """Report list price and Double 11 price for every product of one shop.

    Callback chain:

    1. ``start_requests``  -> shop home page
    2. ``CatelogPage``     -> follows the "all categories" link
    3. ``GoodsPage``       -> reads ``mid`` and requests the async search URL
    4. ``GetPageNumber``   -> reads the page count, requests every page
    5. ``GetAllPages``     -> extracts products and calls ``getprice`` on each
    """

    name = 'clothes'
    allowed_domains = ['handuyishe.tmall.com', 'taobao.com']
    start_urls = ['https://handuyishe.tmall.com/']
    headers = {
        'Host': 'handuyishe.tmall.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0',
        'Connection': 'keep-alive',
        'Referer': 'https://handuyishe.tmall.com/',
    }

    # Paste the raw cookie string here: log in to Tmall, press F12 and copy
    # the Cookie header of any page request. The published listing had bare
    # instructional text at this point, which is not valid Python.
    Cookie = ''
    cookies = _parse_cookie_string(Cookie)

    # Filled in while crawling: the shop's ``mid`` query value and the number
    # of result pages.
    mid = ''
    pages = 0

    _SEARCH_URL = 'https://handuyishe.tmall.com/i/asynSearch.htm'
    _PRICE_URL = 'https://mdskip.taobao.com/core/initItemDetail.htm'

    _MID_PATTERN = re.compile(r'(?:^|[?&])mid=([^&]+)')
    # The pager text has the form "<current>/<total>"; only the total is kept.
    _PAGE_COUNT_PATTERN = re.compile(rb'ui-page-s-len.*?>\s*\d+/(\d+)')
    _CHARSET_PATTERN = re.compile(rb'charset=([\w.:-]+)')
    _ITEM_ID_PATTERN = re.compile(r'[?&]id=([0-9]+)')
    # NOTE(review): the published listing never defines `items`; the
    # statements that extracted them are missing. This pattern is a
    # reconstruction, not the author's original: it captures
    # (url-without-leading-slashes, anchor text) for anchors whose href
    # carries an ``id=<digits>`` parameter, tolerating backslash-escaped
    # quotes. Verify it against a live response before relying on it.
    _ITEM_PATTERN = re.compile(
        r'href=\\?"(?:https?:)?//([^"\\]*?[?&]id=[0-9]+[^"\\]*)\\?"'
        r'[^>]*>\s*([^<>\s][^<>]*?)\s*<'
    )
    _PRICE_PATTERN = re.compile(b'priceInfo.*?"price":"(.*?)"')
    _PROMO_PRICE_PATTERN = re.compile(b'suggestivePromotionList.*?"price":"(.*?)"')

    def start_requests(self):
        """Request the shop home page with the configured headers and cookies."""
        yield Request(self.start_urls[0], callback=self.CatelogPage,
                      headers=self.headers, cookies=self.cookies)

    def CatelogPage(self, response):
        """Follow the home page's "all categories" link to the product list."""
        # NOTE(review): the site presumably serves Simplified Chinese, so the
        # Simplified spelling is accepted next to the original one - confirm.
        href = response.xpath(
            '//a[text()="全部分類" or text()="全部分类"]/@href'
        ).extract_first()
        if not href:
            self.logger.error('No "all categories" link found on %s', response.url)
            return
        yield Request(self._absolute_url(href.strip()), headers=self.headers,
                      cookies=self.cookies, callback=self.GoodsPage)

    def GoodsPage(self, response):
        """Read ``mid`` from the page and request the first async result page."""
        data = response.xpath(
            '//input[@id="J_ShopAsynSearchURL"]/@value'
        ).extract_first(default='')
        match = self._MID_PATTERN.search(data)
        if match is None:
            self.logger.error('No mid value found on %s', response.url)
            return
        self.mid = match.group(1)
        yield Request(self._search_page_url(), headers=self.headers,
                      cookies=self.cookies, callback=self.GetPageNumber)

    def GetPageNumber(self, response):
        """Read the total page count and request every result page."""
        match = self._PAGE_COUNT_PATTERN.search(response.body)
        if match is None:
            self.logger.error('No page count found on %s', response.url)
            return
        self.pages = int(match.group(1))
        for page_no in range(1, self.pages + 1):
            yield Request(self._search_page_url(page_no), headers=self.headers,
                          cookies=self.cookies, callback=self.GetAllPages)

    def GetAllPages(self, response):
        """Print, and yield as a dict, title/link/prices of each product found.

        The printed line is unchanged from the original. The yielded dicts
        additionally let Scrapy's feed exports write the results to a file.
        """
        data = self._decode_body(response)
        # Reuse the Cookie header that was actually sent with this request.
        cookie_values = response.request.headers.getlist('Cookie')
        cookie_header = cookie_values[0].decode() if cookie_values else ''

        seen = set()
        for href, title in self._ITEM_PATTERN.findall(data):
            link = '//' + href
            itemid = self._ITEM_ID_PATTERN.search(link).group(1)
            if itemid in seen:
                # Several anchors may point at one product; price it once.
                continue
            seen.add(itemid)
            price, discountprice = self.getprice(cookie_header, itemid)
            print(title + ':' + link + '' + '原價:' + str(price)
                  + '雙11價:' + str(discountprice))
            yield {
                'title': title,
                'link': link,
                'item_id': itemid,
                'price': price,
                'discount_price': discountprice,
            }

    def getprice(self, cookies, itemId):
        """Fetch the list price and the promotional price of one product.

        Args:
            cookies: raw ``Cookie`` header string to send with the request.
            itemId: product id, sent as the ``itemId`` form field.

        Returns:
            ``(price, discountprice)``. Each element is a string, or ``None``
            when the request failed or the response did not contain it.

        NOTE: this is a blocking ``urllib`` call made from inside a Scrapy
        callback, so the crawl waits while it runs; the timeout bounds that.
        """
        data = {
            'itemId': itemId,
            'isApparel': 'true',
            'isPurchaseMallPage': 'false',
            'isAreaSell': 'false',
            'isUseInventoryCenter': 'false',
            'showShopProm': 'false',
            'cartEnable': 'true',
            'isSecKill': 'false',
            'addressLevel': '2',
            'tryBeforeBuy': 'false',
            'household': 'false',
            'queryMemberRight': 'true',
            'sellerPreview': 'false',
            'service3C': 'false',
            'isForbidBuyItem': 'false',
            'tmallBuySupport': 'true',
            'isRegionLevel': 'false',
            'offlineShop': 'false',
            'callback': 'setMdskip',
            'isg': 'null',
        }
        # Passing ``data`` makes urllib send the fields as a POST body.
        req = URequest(self._PRICE_URL, data=urlencode(data).encode(), headers={
            'Cookie': cookies,
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0',
            'Host': 'mdskip.taobao.com',
            'Referer': 'https://detail.tmall.com',
        })
        # SECURITY: certificate verification stays disabled, as in the
        # original, but only for this request. The original replaced
        # ssl._create_default_https_context for the whole process.
        context = ssl._create_unverified_context()
        try:
            with urllib.request.urlopen(req, timeout=30, context=context) as reply:
                body = reply.read()
        except OSError as exc:  # URLError, HTTPError and socket timeouts
            self.logger.warning('Price lookup failed for item %s: %s', itemId, exc)
            return None, None
        price = self._first_group(self._PRICE_PATTERN, body)
        discountprice = self._first_group(self._PROMO_PRICE_PATTERN, body)
        return price, discountprice

    def _search_page_url(self, page_no=None):
        """Build the async search URL, optionally for a specific page number."""
        url = (self._SEARCH_URL + '?mid=' + self.mid
               + '&path=/category.htm&scene=taobao_shop')
        if page_no is not None:
            url += '&pageNo=' + str(page_no)
        return url

    @staticmethod
    def _absolute_url(href):
        """Return ``href`` as an https URL, whatever form the link came in."""
        if href.startswith(('http://', 'https://')):
            return href
        if href.startswith('//'):
            return 'https:' + href
        # Bare "host/path" form: what the original code assumed.
        return 'https://' + href

    @classmethod
    def _decode_body(cls, response):
        """Decode the response body to text without raising on bad input.

        Uses the charset of the Content-Type header when present, else the
        response's own ``encoding`` attribute if it has one, else UTF-8.
        Undecodable bytes are replaced.
        """
        content_type = response.headers.get('content-type') or b''
        match = cls._CHARSET_PATTERN.search(content_type)
        if match:
            encoding = match.group(1).decode('ascii')
        else:
            encoding = getattr(response, 'encoding', None) or 'utf-8'
        try:
            return response.body.decode(encoding, errors='replace')
        except LookupError:
            # The server named a codec Python does not know.
            return response.body.decode('utf-8', errors='replace')

    @staticmethod
    def _first_group(pattern, body):
        """Return the first capture of ``pattern`` in ``body`` as ``str``, or ``None``."""
        match = pattern.search(body)
        return match.group(1).decode() if match else None