""" ------------------------------------------------
describe: 用来抓取指定的物流公司官网的信息,包含title、keywords、description。
usage: python comp_infos_grab.py
base_info: __version__ = "v.10" __author__ = "PyGo" __time__ = "2019/12/3" __mail__ = "gaoming971366@163.com"
------------------------------------------------ """ import requests import gevent import xlrd import xlwt from gevent import monkey; monkey.patch_all() from bs4 import BeautifulSoup import jieba
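# Environment note (an assumption based on the imports and the py2-style print
# statements below, not stated in the original): the script targets Python 2 and
# needs only third-party packages, e.g.
#
#   pip install requests gevent beautifulsoup4 jieba xlwt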
PUBLIC_URL_LIST = {
    "IML俄罗斯海外仓": "http://www.imlb2c.com/",
    "旺集科技": "http://www.wangjigroup.com/",
    "黑龙江俄速通国际物流有限公司": "http://www.ruston.cc/",
    "AliExpress全球速卖通": "https://sell.aliexpress.com/zh/__pc/shipping/aliexpress_shipping.htm",
    "中外运集装箱运输有限公司": "http://www.sinolines.com/",
    "乐泰国际物流有限公司": "http://www.letaimzl.com/",
    "NOEL诺艾尔集团": "http://www.noelworld.com/",
    "慧承国际物流": "http://www.hcwuliu.com/",
    "满洲里新颖国际货运代理有限公司": "http://www.mzlxinying.com/",
    "运盟国际物流": "http://www.ym-trans.com/",
    "如易科技": "http://www.ruecom.cn/"
}
class companyGrap(object):
    """Singleton grabber: fetch <head> meta information for each company site."""

    _instance = None

    def __init__(self):
        super(companyGrap, self).__init__()

    def __new__(cls, *args, **kwargs):
        # singleton: reuse the one instance no matter how often the class is instantiated
        if companyGrap._instance is None:
            companyGrap._instance = object.__new__(cls)
        return companyGrap._instance

    def _get_infos(self, url):
        """
        request one site and parse title / keywords / description from its <head>
        :param url: company website url
        :return: dict with url, title, keyword, description
        """
        results = {'url': url, 'title': '', 'keyword': '', 'description': ''}
        if not url:
            return results
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/78.0.3904.108 Safari/537.36"
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            head = soup.head
            titles = head.find_all('title')
            results['title'] = titles[0].string if titles else ""
            keywords = head.find_all('meta', attrs={'name': 'keywords'})
            results['keyword'] = keywords[0].attrs.get('content') if keywords else ""
            descriptions = head.find_all('meta', attrs={'name': 'description'})
            results['description'] = descriptions[0].attrs.get('content') if descriptions else ""
        return results

    def to_excel(self, datas, exlname):
        """
        generate data of excel format to save
        :param datas: excel data
        :param exlname: excel name
        :return: None, excel file is written to disk
        """
        f = xlwt.Workbook(encoding='utf-8')
        sheet = f.add_sheet('sheet', cell_overwrite_ok=True)
        EXCEL_TITLES = ["ID", "NAME", "URL", "TITLE", "KEYWORDS", "DESCRIPTION", "REMARK"]
        BUSINESS = "BUSINESS"
        # header style: bold Times New Roman, colour index 4, 11pt
        style_title = xlwt.XFStyle()
        font = xlwt.Font()
        font.name = 'Times New Roman'
        font.bold = True
        font.colour_index = 4
        font.height = 220
        style_title.font = font
        # content style: the same font, not bold
        style_content = xlwt.XFStyle()
        font = xlwt.Font()
        font.name = 'Times New Roman'
        font.bold = False
        font.colour_index = 4
        font.height = 220
        style_content.font = font
        # two header rows: BUSINESS spans TITLE/KEYWORDS/DESCRIPTION,
        # ID/NAME/URL/REMARK span both rows
        for i in range(0, len(EXCEL_TITLES)):
            sheet.write(0, i, EXCEL_TITLES[i], style_title)
        sheet.write_merge(0, 0, 3, 5, BUSINESS, style_title)
        sheet.write_merge(0, 1, 0, 0, 'ID', style_title)
        sheet.write_merge(0, 1, 1, 1, 'NAME', style_title)
        sheet.write_merge(0, 1, 2, 2, 'URL', style_title)
        sheet.write_merge(0, 1, 6, 6, 'REMARK', style_title)
        for i in range(3, 6):
            sheet.write(1, i, EXCEL_TITLES[i], style_content)
        # data rows start below the two header rows
        row = 2
        count = 1
        for line in datas:
            sheet.write(row, 0, count, style_title)
            sheet.write(row, 1, line.get('name'), style_content)
            sheet.write(row, 2, line.get('url'), style_content)
            sheet.write(row, 3, line.get('title'), style_content)
            sheet.write(row, 4, line.get('keyword'), style_content)
            sheet.write(row, 5, line.get('description'), style_content)
            row += 1
            count += 1
        f.save(exlname)

    def _deal_url(self, k, v):
        return self._get_infos(v)

    def to_generate_kw(self, datas):
        """
        cut the collected keywords with jieba and print word frequencies, highest first
        """
        keywords_src = ""
        for data in datas:
            if not data:
                continue
            keywords_src += data.get('keyword') or ""
        keywords = jieba.lcut(keywords_src, cut_all=False)
        counts = dict()
        for word in keywords:
            if not word:
                continue
            if isinstance(word, unicode):
                word = word.encode('utf-8')
            if word in ('|', ',', ' ', '-', ','):
                continue
            counts[word] = counts.get(word, 0) + 1
        ord_counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
        for k in ord_counts:
            print "%s: %s" % (k[0], k[1])

    def run(self, to_excel=False):
        """
        process run: spawn one greenlet per url, print the results,
        count keywords and optionally write an excel file
        :param to_excel: write companys.xls when True
        :return:
        """
        jobs = list()
        names = list()
        excel_datas = list()
        for k, v in PUBLIC_URL_LIST.iteritems():
            if not k or not v:
                continue
            names.append(k)
            jobs.append(gevent.spawn(self._deal_url, k, v))
        gevent.joinall(jobs)
        for name, job in zip(names, jobs):
            value = job.value
            if not value:
                # the greenlet failed (e.g. connection error); skip this site
                continue
            print '==================%s==================' % name
            print 'Title: %s' % value.get('title')
            print 'Keyword: %s' % value.get('keyword')
            print 'Description: %s' % value.get('description')
            value['name'] = name
            excel_datas.append(value)
        self.to_generate_kw(excel_datas)
        if to_excel:
            self.to_excel(excel_datas, 'companys.xls')
            print '---------excel ok'
if __name__ == '__main__':
    companyGrap().run(to_excel=False)
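# A quick single-site check (a sketch, not part of the original flow): call
# _get_infos() directly on one entry from PUBLIC_URL_LIST to confirm the <head>
# parsing before launching the full gevent batch, e.g. from a Python 2 shell:
#
#   >>> from comp_infos_grab import companyGrap
#   >>> infos = companyGrap()._get_infos("http://www.imlb2c.com/")
#   >>> print infos.get('title'), infos.get('keyword')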