util/konradsson.py

125 lines
4.0 KiB
Python

# coding=utf-8
import urllib2
from bs4 import BeautifulSoup
import os
import xlsxwriter
class ScrapeKakel(object):
    """Scrape product data from konradssons.com and export it to Excel.

    Typical use: readPage() collects the product links of a category page,
    getKakel() fetches the attributes of each linked product,
    writeToExcel() puts them on a worksheet, and ``workbook.close()``
    finally saves the file.
    """

    # Worksheet column for each known attribute label (after clean-up by
    # _clean_label).  Labels not listed here get the next free column when
    # they are first written.
    _COLUMNS = {
        'Serie': 0,
        'Priskod': 1,
        'Storlek': 2,
        'Antal': 3,
        'Yta': 4,
        'Tjocklek': 5,
        'Frostsaker': 6,
        'Placering': 7,
        'link': 8,
        'Farg': 9,
        'PEI': 10,
        'Plats': 11,
        'Art nr': 12,
        'Arkstorlek': 13,
    }

    def __init__(self, output_path='C:\\Users\\smipse\\kakel.xlsx',
                 proxies=None):
        """Install the URL opener and create the output workbook.

        Args:
            output_path: Path of the .xlsx file to create.
            proxies: Optional dict mapping protocol name to proxy URL.
                When None, urllib2 reads the ``<protocol>_proxy``
                environment variables (e.g. ``https_proxy``).  Proxy
                credentials must come from there or from the caller and
                must not be hard-coded in this file.
        """
        self._konradsson_page = "https://konradssons.com"
        self._proxy_support = urllib2.ProxyHandler(proxies)
        self._auth = urllib2.HTTPBasicAuthHandler()
        self._opener = urllib2.build_opener(
            self._proxy_support, self._auth, urllib2.HTTPSHandler)
        # Makes this opener the one used by every later urllib2.urlopen().
        urllib2.install_opener(self._opener)
        self.workbook = xlsxwriter.Workbook(output_path)

    @staticmethod
    def _clean_label(text):
        """Turn a scraped attribute label into the key used in _COLUMNS."""
        label = text.strip().replace(u'\xe4', 'a').replace(':', '')
        # Every label beginning with 'Anta' is stored under 'Antal'.
        if label.startswith('Anta'):
            label = 'Antal'
        return label

    def _fetch_soup(self, url):
        """Download *url* and return it parsed; always closes the response."""
        page = urllib2.urlopen(url)
        try:
            return BeautifulSoup(page, 'html.parser')
        finally:
            page.close()

    def readPage(self, urlName):
        """Return the product links found on a category page.

        Args:
            urlName: Path of the category page, appended to the site root.

        Returns:
            A list with the ``href`` of the first ``<a>`` inside every
            ``div.product-item``.  Boxes without a link are skipped.
        """
        soup = self._fetch_soup(self._konradsson_page + urlName)
        konrads_link = []
        for box in soup.find_all('div', attrs={'class': 'product-item'}):
            anchor = box.find('a')
            href = anchor.get('href') if anchor is not None else None
            if href:
                konrads_link.append(href)
        return konrads_link

    def getKakel(self, hrefFromPage):
        """Fetch each product page and collect its attributes.

        Args:
            hrefFromPage: Iterable of product paths as returned by
                readPage().

        Returns:
            A list with one dict per product, mapping cleaned attribute
            labels to their text, plus a ``'link'`` entry holding the
            product URL.  Pages without a product-info block, or whose
            label and value counts differ, are reported on stdout and
            left out.
        """
        kakelList = []
        for linkKakel in hrefFromPage:
            if not linkKakel:
                continue
            link = self._konradsson_page + linkKakel
            # NOTE(review): the URL is sent as Latin-1 bytes; presumably the
            # site expects that for non-ASCII paths -- confirm.
            soup = self._fetch_soup(link.encode('latin1'))
            productInfo = soup.find('div', attrs={'class': 'col-md-12'})
            if productInfo is None:
                print('No product info found on %r' % (link,))
                continue
            left = productInfo.find_all(
                attrs={"class": 'productFeaturesLeft'})
            right = productInfo.find_all(
                attrs={"class": 'productFeaturesRight'})
            if len(left) != len(right):
                # Labels and values cannot be paired; dump the block so the
                # page can be inspected.
                print(productInfo)
                continue
            labels = [self._clean_label(cell.text) for cell in left]
            values = [cell.text.strip() for cell in right]
            labels.append('link')
            values.append(link)
            kakelList.append(dict(zip(labels, values)))
        return kakelList

    def writeToExcel(self, kakelList, name):
        """Write the products to a new worksheet called *name*.

        Row 0 holds the column headers and product ``i`` goes to row
        ``i + 1``.  A header is written the first time its key is seen, so
        attributes that only occur on later products are labelled too.
        Keys missing from _COLUMNS are placed in new columns to the right
        of the known ones.

        Args:
            kakelList: List of attribute dicts as returned by getKakel().
            name: Name of the worksheet to add.
        """
        worksheet = self.workbook.add_worksheet(name=name)
        columns = dict(self._COLUMNS)
        next_free = max(columns.values()) + 1
        labelled = set()
        for index, item in enumerate(kakelList):
            for key, value in item.items():
                if key not in columns:
                    columns[key] = next_free
                    next_free += 1
                if key not in labelled:
                    worksheet.write(0, columns[key], key)
                    labelled.add(key)
                worksheet.write(index + 1, columns[key], value)
if __name__ == "__main__":
    kakel = ScrapeKakel()
    # (category path on the site, worksheet name), exported in this order.
    categories = [
        ('/sortiment/kakel-och-klinker/vagg', 'wall'),
        ('/sortiment/kakel-och-klinker/golvvagg/', 'floorWall'),
        ('/sortiment/natursten/', 'natursten'),
    ]
    try:
        for categoryPath, sheetName in categories:
            productHrefs = kakel.readPage(categoryPath)
            products = kakel.getKakel(productHrefs)
            kakel.writeToExcel(products, sheetName)
    finally:
        # XlsxWriter only writes the file in close(); closing here keeps the
        # sheets collected so far even if a later category fails.
        kakel.workbook.close()