Add scrape konradsson
This commit is contained in:
parent
9b463367d2
commit
824e2b5462
|
|
@ -0,0 +1,124 @@
|
|||
# coding=utf-8
|
||||
import urllib2
|
||||
from bs4 import BeautifulSoup
|
||||
import os
|
||||
import xlsxwriter
|
||||
|
||||
|
||||
class ScrapeKakel(object):
    """Scrape product data from konradssons.com into an Excel workbook.

    Typical use: call ``readPage`` with a category path to collect product
    links, pass them to ``getKakel`` to fetch each product's feature table,
    and hand the result to ``writeToExcel``. The caller is responsible for
    calling ``workbook.close()`` when done.
    """

    # Known feature labels; the position in this tuple is the worksheet column.
    _COLUMNS = (
        'Serie',
        'Priskod',
        'Storlek',
        'Antal',
        'Yta',
        'Tjocklek',
        'Frostsaker',
        'Placering',
        'link',
        'Farg',
        'PEI',
        'Plats',
        'Art nr',
        'Arkstorlek',
    )

    def __init__(self, proxy=None, output_path='C:\\Users\\smipse\\kakel.xlsx'):
        """Install a global urllib2 opener and create the output workbook.

        Args:
            proxy: Optional HTTPS proxy as ``[user:password@]host:port``.
                When omitted, urllib2 takes the proxy configuration from the
                environment (e.g. the ``https_proxy`` variable), so that
                credentials never have to be stored in source control.
            output_path: Path of the .xlsx file that is written when the
                workbook is closed.
        """
        self._konradsson_page = "https://konradssons.com"
        # ProxyHandler(None) falls back to the environment's proxy settings.
        proxies = {"https": proxy} if proxy else None
        self._proxy_support = urllib2.ProxyHandler(proxies)
        self._auth = urllib2.HTTPBasicAuthHandler()
        self._opener = urllib2.build_opener(
            self._proxy_support, self._auth, urllib2.HTTPSHandler)
        # Installed globally because the methods below use urllib2.urlopen().
        urllib2.install_opener(self._opener)
        self.workbook = xlsxwriter.Workbook(output_path)

    @staticmethod
    def _normalize_label(text):
        """Return a feature label reduced to the form used as a column key."""
        label = text.strip().replace(u'ä', 'a').replace(':', '')
        # Every label beginning with "Anta" is collapsed into the single
        # "Antal" column.
        if label.startswith('Anta'):
            label = 'Antal'
        return label

    def readPage(self, urlName):
        """Return the product hrefs found on a category page.

        Args:
            urlName: Path appended to the site root, e.g. ``/sortiment/natursten/``.

        Returns:
            A list with the ``href`` of the first anchor inside every
            ``div.product-item``. Tiles without an anchor or without an href
            are skipped.
        """
        link = self._konradsson_page + urlName
        page = urllib2.urlopen(link)
        try:
            soup = BeautifulSoup(page, 'html.parser')
        finally:
            page.close()

        konrads_link = []
        for item in soup.find_all('div', attrs={'class': 'product-item'}):
            anchor = item.find('a')
            href = anchor.get('href') if anchor is not None else None
            if href:
                konrads_link.append(href)
        return konrads_link

    def getKakel(self, hrefFromPage):
        """Fetch every product page and return its features as dicts.

        Args:
            hrefFromPage: Iterable of product paths as returned by ``readPage``.

        Returns:
            A list with one dict per product, mapping the normalized feature
            label to its text value, plus a ``'link'`` entry holding the
            product URL. Pages without a feature container are skipped; pages
            whose label and value counts differ are printed and skipped.
        """
        kakelList = []
        for linkKakel in hrefFromPage:
            link = self._konradsson_page + linkKakel
            # NOTE(review): URLs are sent latin1-encoded as in the original
            # code - presumably what the site expects; confirm.
            page = urllib2.urlopen(link.encode('latin1'))
            try:
                soup = BeautifulSoup(page, 'html.parser')
            finally:
                page.close()

            productInfo = soup.find('div', attrs={'class': 'col-md-12'})
            if productInfo is None:
                continue

            left = productInfo.find_all(attrs={"class": 'productFeaturesLeft'})
            right = productInfo.find_all(attrs={"class": 'productFeaturesRight'})
            if len(left) != len(right):
                # Labels and values cannot be paired; dump the markup so the
                # page can be inspected by hand.
                print(productInfo)
                continue

            labels = [self._normalize_label(item.text) for item in left]
            values = [item.text.strip() for item in right]
            labels.append('link')
            values.append(link)
            kakelList.append(dict(zip(labels, values)))

        return kakelList

    def writeToExcel(self, kakelList, name):
        """Write the products to a new worksheet, one product per row.

        Row 0 holds the header for every known column; products start at
        row 1. A label that is not a known column gets a new column to the
        right of the existing ones instead of raising ``KeyError``.

        Args:
            kakelList: List of feature dicts as returned by ``getKakel``.
            name: Name of the worksheet to add.
        """
        worksheet = self.workbook.add_worksheet(name=name)

        colPlace = {}
        for col, label in enumerate(self._COLUMNS):
            colPlace[label] = col
            worksheet.write(0, col, label)

        for row, item in enumerate(kakelList, 1):
            for key, value in item.items():
                if key not in colPlace:
                    colPlace[key] = len(colPlace)
                    worksheet.write(0, colPlace[key], key)
                worksheet.write(row, colPlace[key], value)
|
||||
|
||||
if __name__ == "__main__":
    # Scrape each category into its own worksheet of the same workbook.
    # Each entry is (worksheet name, category path on the site).
    categories = [
        ('wall', '/sortiment/kakel-och-klinker/vagg'),
        ('floorWall', '/sortiment/kakel-och-klinker/golvvagg/'),
        ('natursten', '/sortiment/natursten/'),
    ]

    kakel = ScrapeKakel()
    try:
        for sheet_name, category_path in categories:
            product_links = kakel.readPage(category_path)
            products = kakel.getKakel(product_links)
            kakel.writeToExcel(products, sheet_name)
    finally:
        # The .xlsx file is only written on close(); closing in `finally`
        # keeps the sheets collected so far if a later request fails.
        kakel.workbook.close()
|
||||
Loading…
Reference in New Issue