From 824e2b5462f4789dccf39def7a694c66f649097d Mon Sep 17 00:00:00 2001 From: Simon Milvert Date: Sat, 16 Feb 2019 22:04:57 +0100 Subject: [PATCH] Add scrape konradsson --- konradsson.py | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 konradsson.py diff --git a/konradsson.py b/konradsson.py new file mode 100644 index 0000000..9b3b6ae --- /dev/null +++ b/konradsson.py @@ -0,0 +1,124 @@ + # coding=utf-8 +import urllib2 +from bs4 import BeautifulSoup +import os +import xlsxwriter + + +class ScrapeKakel(object): + + def __init__(self): + self._konradsson_page = "https://konradssons.com" + self._proxy_support = urllib2.ProxyHandler({"https": "smipse:hm7gRHmj1986@proxyseso.scania.com:8080"}) + self._auth = urllib2.HTTPBasicAuthHandler() + self._opener = urllib2.build_opener(self._proxy_support, self._auth, urllib2.HTTPSHandler) + urllib2.install_opener(self._opener) + self.workbook = xlsxwriter.Workbook('C:\\Users\\smipse\\kakel.xlsx') + + def readPage(self, urlName): + link = self._konradsson_page + urlName + page = urllib2.urlopen(link) + soup = BeautifulSoup(page, 'html.parser') + + kakel_box = soup.find_all('div', attrs={'class': 'product-item'}) + + konrads_link = [] + for item in kakel_box: + kakel_a= item.find('a') + kakel_href = kakel_a.get('href') + konrads_link.append(kakel_href) + + return konrads_link + + def getKakel(self, hrefFromPage): + + kakelList = [] + + for linkKakel in hrefFromPage: + link = self._konradsson_page + linkKakel + if link == u'sortiment/kakel-och-klinker/golvvagg/granito-arkansas-gra-pavé-8165/': + print link + page = urllib2.urlopen(link.encode('latin1')) + soup = BeautifulSoup(page, 'html.parser') + productInfo = soup.find('div', attrs={'class': 'col-md-12'}) + left = productInfo.find_all(attrs={"class": 'productFeaturesLeft'}) + right = productInfo.find_all(attrs={"class": 'productFeaturesRight'}) + + if len(left) == len(right): + tmpLeft = [] + tmpRight = [] + for item in left: + leftText = item.text.strip() + leftText = leftText.replace(u'ä','a') + leftText = leftText.replace(':', '') + if leftText.startswith('Anta'): + leftText = 'Antal' + + tmpLeft.append(leftText) + for item in right: + tmpRight.append(item.text.strip()) + tmpLeft.append('link') + tmpRight.append(link) + kakelInfo = dict(zip(tmpLeft,tmpRight)) + kakelList.append(kakelInfo) + else: + print productInfo + + return kakelList + + def writeToExcel(self, kakelList, name): + + worksheet = self.workbook.add_worksheet(name=name) + row = 0 + colPlace = { + 'Serie': 0, + 'Priskod': 1, + 'Storlek': 2, + 'Antal': 3, + 'Yta': 4, + 'Tjocklek': 5, + 'Frostsaker': 6, + 'Placering': 7, + 'link': 8, + 'Farg': 9, + 'PEI': 10, + 'Plats': 11, + 'Art nr': 12, + 'Arkstorlek': 13, + } + + for item in kakelList: + col = 0 + for key,value in item.iteritems(): + if row == 0: + worksheet.write(row, colPlace[key], key) + worksheet.write(row + 1, colPlace[key], value) + else: + worksheet.write(row, colPlace[key], value) + col +=1 + if row == 0: + row += 2 + else: + row += 1 + +if __name__ == "__main__": + + + kakel = ScrapeKakel() + wall = '/sortiment/kakel-och-klinker/vagg' + floorWall = '/sortiment/kakel-och-klinker/golvvagg/' + natursten = '/sortiment/natursten/' + + kakelWallHref = kakel.readPage(wall) + wallKakel = kakel.getKakel(kakelWallHref) + kakel.writeToExcel(wallKakel, 'wall') + + kakelfloorWallHref = kakel.readPage(floorWall) + floorWallKakel = kakel.getKakel(kakelfloorWallHref) + kakel.writeToExcel(floorWallKakel, 'floorWall') + + kakelnaturstenHref = kakel.readPage(natursten) + naturstenKakel = kakel.getKakel(kakelnaturstenHref) + kakel.writeToExcel(naturstenKakel, 'natursten') + + kakel.workbook.close()