125 lines
4.0 KiB
Python
125 lines
4.0 KiB
Python
# coding=utf-8
|
|
import urllib2
|
|
from bs4 import BeautifulSoup
|
|
import os
|
|
import xlsxwriter
|
|
|
|
|
|
class ScrapeKakel(object):
    """Scrape tile product data from konradssons.com into an Excel workbook.

    Typical use: call ``readPage`` on a category path to collect product
    links, pass them to ``getKakel`` to fetch each product's feature table,
    hand the result to ``writeToExcel`` and finally close ``self.workbook``.
    """

    # NOTE(review): this default embeds a username and password in source.
    # It is kept only so existing callers behave as before; pass ``proxy``
    # explicitly and move the credential out of the repository.
    _DEFAULT_PROXY = "smipse:hm7gRHmj1986@proxyseso.scania.com:8080"
    _DEFAULT_OUTPUT_PATH = 'C:\\Users\\smipse\\kakel.xlsx'

    # Fixed spreadsheet column for every known (normalised) feature label.
    _COLUMNS = {
        'Serie': 0,
        'Priskod': 1,
        'Storlek': 2,
        'Antal': 3,
        'Yta': 4,
        'Tjocklek': 5,
        'Frostsaker': 6,
        'Placering': 7,
        'link': 8,
        'Farg': 9,
        'PEI': 10,
        'Plats': 11,
        'Art nr': 12,
        'Arkstorlek': 13,
    }

    def __init__(self, proxy=_DEFAULT_PROXY, output_path=_DEFAULT_OUTPUT_PATH):
        """Set up the HTTPS proxy opener and create the output workbook.

        Installing the opener is a process-wide side effect: every later
        ``urllib2.urlopen`` call goes through the proxy.

        Args:
            proxy: Value used for the "https" entry of the proxy handler.
            output_path: File path handed to ``xlsxwriter.Workbook``.
        """
        self._konradsson_page = "https://konradssons.com"
        self._proxy_support = urllib2.ProxyHandler({"https": proxy})
        self._auth = urllib2.HTTPBasicAuthHandler()
        self._opener = urllib2.build_opener(
            self._proxy_support, self._auth, urllib2.HTTPSHandler)
        urllib2.install_opener(self._opener)
        self.workbook = xlsxwriter.Workbook(output_path)

    def readPage(self, urlName):
        """Return the product links found on a category listing page.

        Args:
            urlName: Path appended to the site base URL.

        Returns:
            A list with the ``href`` of the first anchor inside every
            ``div.product-item``. Boxes without an anchor or without an
            ``href`` are skipped.
        """
        link = self._konradsson_page + urlName
        page = urllib2.urlopen(link)
        try:
            # BeautifulSoup consumes the response while constructing the tree,
            # so the connection can be released right afterwards.
            soup = BeautifulSoup(page, 'html.parser')
        finally:
            page.close()

        konrads_link = []
        for item in soup.find_all('div', attrs={'class': 'product-item'}):
            anchor = item.find('a')
            href = anchor.get('href') if anchor is not None else None
            if href:
                konrads_link.append(href)
        return konrads_link

    @staticmethod
    def _normalizeLabel(text):
        """Turn a raw feature label into the key used for the column map."""
        label = text.strip()
        label = label.replace(u'ä', 'a')
        label = label.replace(':', '')
        # Every label beginning with "Anta" is stored under the single key
        # 'Antal'.
        if label.startswith('Anta'):
            label = 'Antal'
        return label

    def getKakel(self, hrefFromPage):
        """Fetch every product page and collect its feature table.

        Args:
            hrefFromPage: Iterable of product paths (as returned by
                ``readPage``); each is appended to the site base URL.

        Returns:
            A list of dicts, one per successfully parsed product, mapping the
            normalised feature label to its text value plus a 'link' entry
            holding the product URL. Pages without a ``div.col-md-12`` or
            with an unequal number of label and value cells are reported on
            stdout and left out.
        """
        kakelList = []

        for linkKakel in hrefFromPage:
            link = self._konradsson_page + linkKakel
            # The URL is sent Latin-1 encoded, as in the original code;
            # presumably the site expects that for non-ASCII paths - verify.
            page = urllib2.urlopen(link.encode('latin1'))
            try:
                soup = BeautifulSoup(page, 'html.parser')
            finally:
                page.close()

            productInfo = soup.find('div', attrs={'class': 'col-md-12'})
            if productInfo is None:
                print('No product info found for %r' % (link,))
                continue

            left = productInfo.find_all(attrs={"class": 'productFeaturesLeft'})
            right = productInfo.find_all(attrs={"class": 'productFeaturesRight'})
            if len(left) != len(right):
                # Labels and values cannot be paired; dump the markup so the
                # page can be inspected by hand.
                print(productInfo)
                continue

            labels = [self._normalizeLabel(cell.text) for cell in left]
            values = [cell.text.strip() for cell in right]
            kakelInfo = dict(zip(labels, values))
            # Set last so the product URL wins over any scraped 'link' label.
            kakelInfo['link'] = link
            kakelList.append(kakelInfo)

        return kakelList

    def writeToExcel(self, kakelList, name):
        """Write the scraped products to a new worksheet.

        Row 0 holds a header for every label that occurs in any product;
        product ``i`` is written to row ``i + 1``. Known labels use their
        fixed column; labels missing from the fixed map get the next free
        column instead of raising ``KeyError``.

        Args:
            kakelList: List of label -> value dicts (see ``getKakel``).
            name: Name of the worksheet to add to ``self.workbook``.
        """
        worksheet = self.workbook.add_worksheet(name=name)

        # Decide on every column up front so the header row is complete even
        # when the first product lacks some of the labels.
        columns = dict(self._COLUMNS)
        used = set()
        for item in kakelList:
            for key in sorted(item):
                if key not in columns:
                    columns[key] = len(columns)
                used.add(key)

        for key in sorted(used, key=columns.get):
            worksheet.write(0, columns[key], key)

        for row, item in enumerate(kakelList, 1):
            for key, value in item.items():
                worksheet.write(row, columns[key], value)
|
|
|
|
if __name__ == "__main__":
    # Scrape each category listing in turn and store it on its own worksheet.
    kakel = ScrapeKakel()

    # (category path on the site, worksheet name), processed in this order.
    categories = (
        ('/sortiment/kakel-och-klinker/vagg', 'wall'),
        ('/sortiment/kakel-och-klinker/golvvagg/', 'floorWall'),
        ('/sortiment/natursten/', 'natursten'),
    )

    for category_path, sheet_name in categories:
        product_links = kakel.readPage(category_path)
        products = kakel.getKakel(product_links)
        kakel.writeToExcel(products, sheet_name)

    # The workbook file is written when it is closed.
    kakel.workbook.close()
|