Python 韓都衣舍雙11打折力度如何爬取

分析：韓都衣舍旗艦店首頁網址為：https://handuyishe.tmall.com/

所有商品的網址為首頁上「全部分類」連結所指向的頁面（路徑為 /category.htm）。

分析思路如下：在火狐瀏覽器上按下F12開啟開發者工具，再按下F5重新整理頁面

可以觀察商品是通過JS觸發的，網址如下：

https://handuyishe.tmall.com/i/asynSearch.htm

其中有效參數如下：

callback=jsonp140

mid=w-14593428692-0

path=/category.htm

scene=taobao_shop

pageNo=2

現在重點即獲取mid值，其中pageNo為頁碼

如何獲取mid值

查看網頁原始程式碼，搜索w-14593428692-0，可以發現該值位於 id 為 J_ShopAsynSearchURL 的 input 元素的 value 屬性中

如何獲取頁碼：在商品頁的上面有頁碼數，如下所示2/57，其中57即為頁碼總數

點開任意一個商品連結，按下F12，觀察商品的價格，可以發現價格也是通過JS觸發的，其中觸發網址為

https://mdskip.taobao.com/core/initItemDetail.htm，需要的主要參數為 itemId，該值即商品網址中的 id 參數

原始程式碼：

# -*- coding: utf-8 -*-

import scrapy

from scrapy.http import Request

import urllib

import re

from urllib.request import Request as URequest

from urllib.parse import urlencode

import ssl

class ClothesSpider(scrapy.Spider):
    """Crawl the Handuyishe Tmall shop and print list / Double-11 prices.

    Request chain:

    1. ``start_requests``  -> shop home page
    2. ``CatelogPage``     -> follow the "all categories" link
    3. ``GoodsPage``       -> read the ``mid`` widget id from the page
    4. ``GetPageNumber``   -> read the total page count from the async listing
    5. ``GetAllPages``     -> for every listed item call ``getprice`` and print

    The spider needs the cookie string of a logged-in Tmall session; paste
    it into ``Cookie`` below before running.
    """

    name = 'clothes'
    allowed_domains = ['handuyishe.tmall.com', 'taobao.com']
    start_urls = ['https://handuyishe.tmall.com/']

    headers = {
        'Host': 'handuyishe.tmall.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0',
        'Connection': 'keep-alive',
        'Referer': 'https://handuyishe.tmall.com/',
    }

    # Paste the raw cookie string here: log in to Tmall, press F12 and copy
    # the "Cookie" request header of any page.  The article shipped a bare
    # placeholder sentence on this line, which raised NameError on import.
    Cookie = ''

    # "k1=v1; k2=v2" -> {"k1": "v1", "k2": "v2"}.  partition() keeps any '='
    # inside a value intact; fragments without '=' (including the empty
    # string produced by an empty ``Cookie``) are skipped instead of raising.
    cookies = {
        pair.partition('=')[0].strip(): pair.partition('=')[2].strip()
        for pair in Cookie.split(';')
        if '=' in pair
    }

    mid = ''
    pages = 0

    # Upper bound (seconds) for the blocking price request in ``getprice``.
    price_timeout = 10

    _MID_RE = re.compile(r'mid=([^&]+)')
    # The pager renders "current/total"; on the first page that is "1/<total>".
    _PAGE_COUNT_RE = re.compile(rb'ui-page-s-len.*?>1/(\d+)')
    _CHARSET_RE = re.compile(rb"charset=\\?[\"']?([\w-]+)", re.I)
    _ITEM_ID_RE = re.compile(r'(?:^|[?&;])id=(\d+)')
    _PRICE_RE = re.compile(b'priceInfo.*?"price":"(.*?)"')
    _DISCOUNT_RE = re.compile(b'suggestivePromotionList.*?"price":"(.*?)"')

    # NOTE(review): the item-extraction pattern was lost from the published
    # listing (the HTML inside it was stripped).  This is a reconstruction
    # that captures (link-without-leading-slashes, title) from text-only
    # anchors pointing at detail.tmall.com, tolerating the backslash-escaped
    # quotes of a JSONP payload.  Verify it against a live response.
    item_pattern = re.compile(
        r'<a[^>]*?href=\\?"//(detail\.tmall\.com/item\.htm[^"\\]*)\\?"[^>]*>'
        r'\s*([^<]+?)\s*<\\?/a>'
    )

    def start_requests(self):
        """Issue the initial request(s) with the session headers and cookies."""
        for url in self.start_urls:
            yield Request(url, callback=self.CatelogPage,
                          headers=self.headers, cookies=self.cookies)

    def CatelogPage(self, response):
        """Follow the "all categories" link found on the shop home page."""
        # The listing's literal is Traditional Chinese, but the article text
        # appears to have been script-converted; presumably the live site
        # uses the Simplified label, so accept both (TODO confirm).
        href = response.xpath(
            '//a[text()="全部分類" or text()="全部分类"]/@href'
        ).extract_first()
        if not href:
            self.logger.warning('category link not found on %s', response.url)
            return
        href = href.strip()
        if href.startswith(('http://', 'https://', '/')):
            # Absolute, protocol-relative ("//host/...") or root-relative.
            url = response.urljoin(href)
        else:
            # Bare "host/path" form: keep the original prefixing behaviour.
            url = 'https://' + href
        yield Request(url, headers=self.headers, cookies=self.cookies,
                      callback=self.GoodsPage)

    def GoodsPage(self, response):
        """Extract the ``mid`` value and request the async item listing."""
        search_url = response.xpath(
            '//input[@id="J_ShopAsynSearchURL"]/@value'
        ).extract_first()
        found = self._MID_RE.search(search_url or '')
        if found is None:
            self.logger.warning('mid value not found on %s', response.url)
            return
        self.mid = found.group(1)
        yield Request(self._listing_url(), headers=self.headers,
                      cookies=self.cookies, callback=self.GetPageNumber)

    def GetPageNumber(self, response):
        """Read the total page count and request every listing page."""
        found = self._PAGE_COUNT_RE.search(response.body)
        if found is None:
            self.logger.warning('page count not found on %s', response.url)
            return
        self.pages = int(found.group(1))
        for page_no in range(1, self.pages + 1):
            yield Request(self._listing_url(page_no), headers=self.headers,
                          cookies=self.cookies, callback=self.GetAllPages)

    def GetAllPages(self, response):
        """Print title, link, list price and Double-11 price of each item."""
        text = self._decode_body(response)
        # Reuse the cookie header Scrapy actually sent for this page; it is
        # absent when no cookies were configured.
        cookie_header = response.request.headers.get('Cookie', b'').decode()
        for href, title in self.item_pattern.findall(text):
            link = '//' + href
            found = self._ITEM_ID_RE.search(link)
            if found is None:
                continue
            try:
                price, discountprice = self.getprice(cookie_header, found.group(1))
            except OSError as exc:
                # URLError and socket timeouts are OSError subclasses; one
                # failing item must not abort the rest of the page.
                self.logger.warning('price lookup failed for %s: %s', link, exc)
                continue
            # Output only; the csv module could write these rows to a file.
            print(title + ':' + link + '' + '原價:' + str(price) + '雙11價:' + str(discountprice))

    def getprice(self, cookies, itemId):
        """Fetch the list price and the Double-11 price of one item.

        :param cookies: raw ``Cookie`` header string to send.
        :param itemId: item id taken from the item URL's ``id`` parameter.
        :returns: ``(price, discountprice)`` as strings; either element is
            ``None`` when the response does not contain it (for example an
            item that has no promotion).
        :raises OSError: on network failure or timeout.

        NOTE: this is a blocking urllib call made from a Scrapy callback, so
        it stalls the crawler while it runs; ``price_timeout`` bounds it.
        """
        params = {
            'itemId': itemId,
            'isApparel': 'true',
            'isPurchaseMallPage': 'false',
            'isAreaSell': 'false',
            'isUseInventoryCenter': 'false',
            'showShopProm': 'false',
            'cartEnable': 'true',
            'isSecKill': 'false',
            'addressLevel': '2',
            'tryBeforeBuy': 'false',
            'household': 'false',
            'queryMemberRight': 'true',
            'sellerPreview': 'false',
            'service3C': 'false',
            'isForbidBuyItem': 'false',
            'tmallBuySupport': 'true',
            'isRegionLevel': 'false',
            'offlineShop': 'false',
            'callback': 'setMdskip',
            'isg': 'null',
        }
        url = 'https://mdskip.taobao.com/core/initItemDetail.htm'
        req = URequest(url, data=urlencode(params).encode(), headers={
            'Cookie': cookies,
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0',
            'Host': 'mdskip.taobao.com',
            'Referer': 'https://detail.tmall.com',
        })
        # SECURITY: certificate verification stays disabled, as in the
        # original, but only for this request instead of monkey-patching the
        # process-wide default context.
        insecure_context = ssl._create_unverified_context()
        with urllib.request.urlopen(req, timeout=self.price_timeout,
                                    context=insecure_context) as resp:
            body = resp.read()
        price = self._first_group(self._PRICE_RE, body)
        discountprice = self._first_group(self._DISCOUNT_RE, body)
        return price, discountprice

    def _listing_url(self, page_no=None):
        """Build the async listing URL for ``self.mid``, optionally for a page."""
        url = ('https://handuyishe.tmall.com/i/asynSearch.htm?mid=' + self.mid
               + '&path=/category.htm&scene=taobao_shop')
        if page_no is not None:
            url += '&pageNo=' + str(page_no)
        return url

    def _decode_body(self, response):
        """Decode the response body using the charset it declares.

        Looks in the Content-Type header first, then at the start of the
        body, and falls back to UTF-8.  NOTE(review): the body-side lookup
        and the fallback are reconstructions; the original lines were lost.
        """
        content_type = response.headers.get('Content-Type', b'') or b''
        found = (self._CHARSET_RE.search(content_type)
                 or self._CHARSET_RE.search(response.body[:2048]))
        encoding = found.group(1).decode('ascii') if found else 'utf-8'
        try:
            return response.body.decode(encoding, errors='replace')
        except LookupError:
            # Unknown charset name declared by the server.
            return response.body.decode('utf-8', errors='replace')

    @staticmethod
    def _first_group(pattern, payload):
        """Return the first capture of ``pattern`` in ``payload`` as str, or None."""
        found = pattern.search(payload)
        return found.group(1).decode() if found else None