# coding=utf-8
"""Scrape tile product data from konradssons.com into an Excel workbook.

This is a Python 2 script (it relies on ``urllib2``).
"""
import os
import urllib2

from bs4 import BeautifulSoup
import xlsxwriter


class ScrapeKakel(object):
    """Collect product feature tables from the shop and write them to Excel.

    All HTTPS traffic is routed through a proxy by installing a global
    ``urllib2`` opener.  The workbook is exposed as ``self.workbook``; the
    caller is responsible for calling ``self.workbook.close()`` when done.
    """

    # NOTE(security): proxy credentials are hard-coded here.  They are kept
    # only so existing callers keep working; prefer passing ``https_proxy``
    # or setting the KAKEL_HTTPS_PROXY environment variable, then remove
    # this default from source control.
    _DEFAULT_HTTPS_PROXY = "smipse:hm7gRHmj1986@proxyseso.scania.com:8080"
    _DEFAULT_WORKBOOK_PATH = 'C:\\Users\\smipse\\kakel.xlsx'

    # Fixed spreadsheet column for every feature label known in advance.
    # Labels not listed here get the next free column in writeToExcel.
    _COLUMN_INDEX = {
        'Serie': 0,
        'Priskod': 1,
        'Storlek': 2,
        'Antal': 3,
        'Yta': 4,
        'Tjocklek': 5,
        'Frostsaker': 6,
        'Placering': 7,
        'link': 8,
        'Farg': 9,
        'PEI': 10,
        'Plats': 11,
        'Art nr': 12,
        'Arkstorlek': 13,
    }

    def __init__(self, https_proxy=None, workbook_path=None):
        """Install the proxy-aware opener and create the output workbook.

        Args:
            https_proxy: Proxy specification for HTTPS requests.  When
                omitted, the KAKEL_HTTPS_PROXY environment variable is used
                if set, otherwise the built-in default.
            workbook_path: Destination of the .xlsx file.  Defaults to the
                path the script has always written to.
        """
        if https_proxy is None:
            https_proxy = os.environ.get('KAKEL_HTTPS_PROXY',
                                         self._DEFAULT_HTTPS_PROXY)
        if workbook_path is None:
            workbook_path = self._DEFAULT_WORKBOOK_PATH

        self._konradsson_page = "https://konradssons.com"
        self._proxy_support = urllib2.ProxyHandler({"https": https_proxy})
        self._auth = urllib2.HTTPBasicAuthHandler()
        self._opener = urllib2.build_opener(self._proxy_support, self._auth,
                                            urllib2.HTTPSHandler)
        # Global side effect: every later urllib2.urlopen() uses this opener.
        urllib2.install_opener(self._opener)
        self.workbook = xlsxwriter.Workbook(workbook_path)

    @staticmethod
    def _fetch_soup(url):
        """Download ``url`` and return it parsed with html.parser.

        The HTTP response is always closed, even if parsing fails.
        """
        page = urllib2.urlopen(url)
        try:
            return BeautifulSoup(page, 'html.parser')
        finally:
            page.close()

    @staticmethod
    def _normalize_label(text):
        """Turn a raw feature label into the key used for column lookup."""
        label = text.strip().replace(u'ä', 'a').replace(':', '')
        # Any label beginning with 'Anta' is collapsed to the single key
        # 'Antal' so it lands in one column.
        if label.startswith('Anta'):
            label = 'Antal'
        return label

    def readPage(self, urlName):
        """Return the product hrefs listed on a category page.

        Args:
            urlName: Path of the category page, appended to the site root.

        Returns:
            List of href values, one per 'product-item' div that contains a
            link.  Tiles without an anchor or without an href are skipped.
        """
        soup = self._fetch_soup(self._konradsson_page + urlName)
        konrads_link = []
        for item in soup.find_all('div', attrs={'class': 'product-item'}):
            anchor = item.find('a')
            href = anchor.get('href') if anchor is not None else None
            if href:
                konrads_link.append(href)
        return konrads_link

    def getKakel(self, hrefFromPage):
        """Fetch every product page and extract its feature table.

        Args:
            hrefFromPage: Iterable of product hrefs as returned by readPage.

        Returns:
            List of dicts, one per successfully parsed product, mapping the
            normalized feature label to its text plus a 'link' entry with
            the product URL.  Pages whose label and value counts differ, or
            that lack the 'col-md-12' container, are reported on stdout and
            skipped.
        """
        kakelList = []
        for linkKakel in hrefFromPage:
            link = self._konradsson_page + linkKakel
            # hrefs can contain non-ASCII characters; the request URL is
            # sent Latin-1 encoded, exactly as this script always did.
            soup = self._fetch_soup(link.encode('latin1'))
            productInfo = soup.find('div', attrs={'class': 'col-md-12'})
            if productInfo is None:
                # %r keeps the message ASCII-safe on any console.
                print('No product info found on %r' % (link,))
                continue

            left = productInfo.find_all(attrs={"class": 'productFeaturesLeft'})
            right = productInfo.find_all(attrs={"class": 'productFeaturesRight'})
            if len(left) != len(right):
                # Labels and values cannot be paired; dump the markup for
                # inspection rather than guessing.
                print(productInfo)
                continue

            labels = [self._normalize_label(item.text) for item in left]
            values = [item.text.strip() for item in right]
            labels.append('link')
            values.append(link)
            kakelList.append(dict(zip(labels, values)))
        return kakelList

    def writeToExcel(self, kakelList, name):
        """Write the products to a new worksheet called ``name``.

        Row 0 holds a header for every column used by at least one product;
        product *i* (0-based) is written to row *i + 1*.  Labels missing
        from the predefined column map are appended after the known columns
        instead of raising KeyError.  An empty list yields an empty sheet.

        Args:
            kakelList: List of dicts as returned by getKakel.
            name: Worksheet name.
        """
        worksheet = self.workbook.add_worksheet(name=name)

        columns = dict(self._COLUMN_INDEX)
        used_keys = set()
        for item in kakelList:
            for key in item:
                if key not in columns:
                    # Column indices are contiguous from 0, so the current
                    # size is the next free index.
                    columns[key] = len(columns)
                used_keys.add(key)

        for key in used_keys:
            worksheet.write(0, columns[key], key)

        for row, item in enumerate(kakelList, 1):
            for key, value in item.items():
                worksheet.write(row, columns[key], value)


def main():
    """Scrape the three product categories into one sheet each."""
    sections = (
        ('/sortiment/kakel-och-klinker/vagg', 'wall'),
        ('/sortiment/kakel-och-klinker/golvvagg/', 'floorWall'),
        ('/sortiment/natursten/', 'natursten'),
    )
    kakel = ScrapeKakel()
    try:
        for path, sheet_name in sections:
            product_links = kakel.readPage(path)
            kakel.writeToExcel(kakel.getKakel(product_links), sheet_name)
    finally:
        # The file is only written on close(); do it even after a failure
        # so sheets finished so far are not lost.
        kakel.workbook.close()


if __name__ == "__main__":
    main()