当前位置:网站首页>多线程爬取马可波罗网供应商数据
多线程爬取马可波罗网供应商数据
2022-04-23 05:46:00 【圆滚滚的程序员】
本文旨在交流学习,勿作他用,否则后果自负
环境:Linux + PyCharm + Anaconda
import csv
import json
import logging
import os
import random
import threading
from queue import Queue

import requests
from lxml import etree

from usere_agent import UA
# Request headers shared by every HTTP call made through get_request().
# The User-Agent value is supplied by the local ``usere_agent`` module.
HEADER = {
    'User-Agent': UA,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip, deflate',
}
def get_request(url):
    """Download *url* using the shared ``HEADER`` and return the body text.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The decoded response text, or ``None`` when the request could not
        be completed (connection error, timeout, malformed URL, ...).
        Callers must check for ``None`` before parsing the result.
    """
    try:
        response = requests.get(
            url=url,
            headers=HEADER,
            verify=True,
            timeout=50,
        )
    except requests.RequestException as exc:
        # The crawler is best-effort, so a failed page must not stop the run,
        # but the failure is reported instead of being silently discarded.
        logging.getLogger(__name__).warning('request failed for %s: %s', url, exc)
        return None
    return response.text
class Img(threading.Thread):
    """Worker thread that scrapes supplier data from product-list pages.

    Work items are taken from a shared queue.  Each item is normally a
    ``(category_title, page_url)`` tuple so that every scraped row is stored
    under the category it was found in; a bare URL string is also accepted
    and is filed under the category name ``'unknown'``.

    Args:
        list_img: ``queue.Queue`` holding the work items.
        out_dir: Directory receiving one ``<category>.csv`` file per
            category.  Defaults to the path the script originally used.
    """

    DEFAULT_OUT_DIR = '/media/liu/_dde_data/project/spider/供应商/mkbl_data/'

    # Shared by all workers: several threads append to the same CSV files.
    _write_lock = threading.Lock()

    def __init__(self, list_img, out_dir=DEFAULT_OUT_DIR):
        threading.Thread.__init__(self)
        self.list_img = list_img
        self.out_dir = out_dir

    def run(self):
        """Process queue items forever; the thread is expected to be a daemon."""
        log = logging.getLogger(__name__)
        while True:
            keys = self.list_img.get()  # blocks until a work item is available
            try:
                self.Get_img(keys)
            except Exception:
                # Worker boundary: report the problem and keep the thread alive.
                log.exception('unexpected error while processing %r', keys)
            finally:
                # Always mark the item as handled, otherwise Queue.join()
                # in the main thread would wait forever.
                self.list_img.task_done()

    def Get_img(self, key):
        """Scrape every product linked from one product-list page.

        Args:
            key: ``(category_title, page_url)`` tuple, or a bare page URL.
        """
        if isinstance(key, (tuple, list)):
            category, page_url = key
        else:
            category, page_url = 'unknown', key

        log = logging.getLogger(__name__)
        page = _parse_html(get_request(page_url))
        if page is None:
            return
        good_url = page.xpath(
            r'.//div[@class="s_product_item"]//div[@class="s_product_pic_box"]/a[@target="_blank"]/@href')
        for product_url in good_url:
            # Handle failures per product so one bad entry does not discard
            # the remaining products of the page.
            try:
                self._scrape_product(category, product_url)
            except Exception:
                log.exception('failed to scrape product %s', product_url)

    def _scrape_product(self, category, product_url):
        """Collect product and supplier fields for one product and store them."""
        detail = _parse_html(get_request(product_url))
        if detail is None:
            return
        title_deta = detail.xpath(r'.//div[@class="con_msg f1"]/div[@class="con_title"]/text()')
        price = detail.xpath(
            r'.//div[@class="con_msg f1"]/div[@class="con_price"]/span[@class="price"]/text()')
        company_name = detail.xpath(
            r'.//div[@class="con_msg f1"]//div[@class="con_item"]/ul/li[3]/a[@target="_blank"]/text()')
        company_href = detail.xpath(
            r'.//div[@class="con_msg f1"]//div[@class="con_item"]/ul/li[3]/a[@target="_blank"]/@href')
        if not company_href:
            return

        company = _parse_html(get_request(company_href[0]))
        if company is None:
            return
        contacts = company.xpath(r'.//div[@class="item_info"]/ul/li[1]/text()')
        phone = company.xpath(r'.//div[@class="item_info"]/ul/li[2]/span[2]/text()')
        address = company.xpath(r'.//div[@class="item_info"]/ul/li[3]/text()')

        fields = [title_deta, price, company_name, company_href, contacts, phone, address]
        if not all(fields):
            # As before, incomplete records are not written; they are now
            # skipped explicitly instead of via a swallowed IndexError.
            logging.getLogger(__name__).warning(
                'skipping incomplete record for %s', product_url)
            return

        row = [category] + [values[0] for values in fields]
        self._append_row(category, row)
        print(*row)

    def _append_row(self, category, row):
        """Append *row* to the CSV file belonging to *category*."""
        # The title comes from page text; keep it from escaping out_dir.
        safe_name = category.strip().replace('/', '_').replace(os.sep, '_') or 'unknown'
        path = os.path.join(self.out_dir, safe_name + '.csv')
        with self._write_lock:
            if self.out_dir:
                os.makedirs(self.out_dir, exist_ok=True)
            # newline='' is required by the csv module; utf-8 keeps the
            # Chinese text independent of the system locale.
            with open(path, 'a', newline='', encoding='utf-8') as f:
                csv.writer(f).writerow(row)


def _parse_html(html):
    """Return an lxml tree for *html*, or ``None`` if it cannot be parsed."""
    if not html:
        return None
    try:
        return etree.HTML(html)
    except (etree.LxmlError, ValueError):
        return None


if __name__ == '__main__':
    PAGES_PER_CATEGORY = 100
    WORKER_COUNT = 9

    list_img = Queue()
    url = 'http://china.makepolo.com/list/d14/'
    data = _parse_html(get_request(url))
    if data is None:
        raise SystemExit('could not load the category index: ' + url)

    href = data.xpath(r'.//div[@class="category clearfix"]//dl//dd//a/@href')
    title = data.xpath(r'.//div[@class="category clearfix"]//dl//dd//a/text()')
    for ti, h in zip(title, href):
        for i in range(1, PAGES_PER_CATEGORY + 1):
            # Ship the category title with the URL; the workers run after
            # this loop has finished and cannot rely on the loop variable.
            list_img.put((ti, h + '{}/'.format(i)))

    for _ in range(WORKER_COUNT):
        t = Img(list_img)
        t.daemon = True  # let the process exit once the queue is drained
        t.start()
    list_img.join()
版权声明
本文为[圆滚滚的程序员]所创,转载请带上原文链接,感谢
https://blog.csdn.net/qq_39483957/article/details/106340594
边栏推荐
猜你喜欢
GDAL+OGR学习
基于Sentinel+Nacos 对Feign Client 动态添加默认熔断规则
[leetcode 67] sum of two binary numbers
1007 go running (hdu6808) in the fourth game of 2020 Hangzhou Electric Multi school competition
檢測技術與原理
The most practical chrome plug-in
Addition, deletion, modification and query of MySQL table
C language file operation
Gesture recognition research
Installation and usage skills of idea
随机推荐
Miscellaneous 1
8. Integer Decomposition
@Problems caused by internal dead loop of postconstruct method
xlsxwriter.exceptions.FileCreateError: [Errno 13] Permission denied问题
Stability building best practices
MySQL occasional Caton
从源代码到可执行文件的过程
小区房价可视化
程序設計訓練
12. Monkeys climb mountains
Sakura substring thinking
Understanding and installing MySQL
Understanding and use of tp50, tp90 and tp99
Explain of MySQL optimization
[leetcode 202] happy number
Animation - Introduction to keyframes
[leetcode 54] spiral matrix
SVN简单操作命令
Collection and map thread safety problem solving
Easy to use data set and open source network comparison website