当前位置:网站首页>爬取手游网站游戏详情和评论(MQ+多线程)
爬取手游网站游戏详情和评论(MQ+多线程)
2022-04-23 05:46:00 【圆滚滚的程序员】
本文仅供交流学习,请勿用于其他用途,否则后果自负。
环境:Linux + PyCharm + Anaconda
import json
import logging
import random
import re
import threading
import time
from queue import Empty, Queue

import requests
from lxml import etree
# Producer side.
# Pool of browser User-Agent strings; HEADER below picks one of them at random.
# NOTE: two entries appear twice in this list, so random.choice is slightly
# weighted toward them.
USER_AGENTS = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
]
# Request headers passed as `headers=` to every requests.get call in this script.
# NOTE: random.choice runs once, when this dict is built, so all requests made
# by this process share the same User-Agent.
HEADER = {
'User-Agent': random.choice(USER_AGENTS),
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Connection': 'keep-alive',
'Accept-Encoding': 'gzip, deflate',
}
class Productthread(threading.Thread):
    """Producer thread: download queued URLs and pass the page text to consumers.

    Each worker drains ``page_queue`` and pushes every successfully downloaded
    body onto the module-level ``response_queue``.
    """

    def __init__(self, name, page_queue):
        """Create a producer.

        Args:
            name: Thread name.
            page_queue: Queue of URL strings to download.
        """
        super().__init__()
        self.name = name
        self.q = page_queue

    def run(self):
        """Fetch URLs until ``page_queue`` is empty, then exit.

        A URL whose download fails is logged and skipped; it does not stop
        the worker.
        """
        # Loop so the worker keeps taking tasks instead of quitting after one.
        while True:
            try:
                # With several producers sharing the queue, a separate empty()
                # check can go stale before get(); the non-blocking get itself
                # is the reliable emptiness test.
                key = self.q.get(block=False)
            except Empty:
                break
            try:
                game_r = requests.get(url=key, headers=HEADER, timeout=50)
            except requests.RequestException as exc:
                logging.warning("%s: failed to fetch %s: %s", self.name, key, exc)
                continue
            # Decode the raw bytes directly. Re-encoding .text with .encoding
            # only reconstructs these bytes, and fails when .encoding is None.
            game_res = game_r.content.decode('utf-8', 'ignore')
            response_queue.put(game_res)
# Consumer: parse the downloaded pages
class Consumerthread(threading.Thread):
    """Consumer thread: parse bodies from ``response_queue`` and print the fields.

    A body that starts with ``{`` is treated as a JSON comment page; anything
    else is parsed as a game-detail HTML page. The worker reads the
    module-level ``response_queue`` and ``flag`` globals.
    """

    # Seconds to wait for a body before re-checking the shutdown condition.
    poll_timeout = 1.0

    def __init__(self, name):
        """Create a consumer.

        Args:
            name: Thread name.
        """
        super().__init__()
        self.name = name

    def run(self):
        """Process queued bodies until the queue is empty and ``flag`` is set."""
        # Loop so the worker keeps taking tasks instead of quitting after one.
        while True:
            # Stop once nothing is queued and the main thread has signalled
            # (via flag) that no producer is left to add more.
            if response_queue.empty() and flag:
                break
            try:
                # A bounded wait lets an idle consumer come back and re-check
                # flag; an unbounded get() would block forever after the
                # producers have finished.
                response = response_queue.get(timeout=self.poll_timeout)
            except Empty:
                continue
            if not response.strip():
                continue
            try:
                if response.lstrip().startswith('{'):
                    self._print_comments(response)
                else:
                    self._print_game_detail(response)
            except Exception:
                # Worker boundary: one malformed page must not kill the
                # thread, but the failure should be visible.
                logging.exception("%s: failed to parse a response", self.name)

    @staticmethod
    def _absolute_url(src):
        """Return *src* unchanged if it already has a scheme, else prefix ``https:``."""
        if src.startswith('http'):
            return src
        return 'https:' + src

    def _print_comments(self, response):
        """Print one line per comment contained in a JSON comment page."""
        respon = json.loads(response)
        # A page without a 'content' entry has no comments to print.
        comment_list = respon.get('content') or []
        for j in comment_list:
            comment_id = j['id']
            username = j['username']
            uid = j['uid']
            avatar = self._absolute_url(j['avatar'])
            time_array = time.localtime(int(j['timeu']))
            posted_at = time.strftime("%Y--%m--%d %H:%M:%S", time_array)
            comment = j['comment']
            good_num = j['good_num']
            num = j['num']
            print(comment_id, username, uid, avatar, posted_at, comment, good_num, num)

    def _print_game_detail(self, response):
        """Print name, logo, introduction, score and comment count of a game page.

        The game id is not part of the queued body, so it is not printed here.
        Pages that do not match the expected layout are skipped.
        """
        game_d = etree.HTML(response)
        if game_d is None:
            return
        name_list = game_d.xpath(
            r"/html/body/div[1]/div[3]/div[1]/div[1]/div[1]/div/div[1]/h1/text()")  # name
        logo_list = game_d.xpath(
            r'/html/body/div[1]/div[3]/div[1]/div[1]/div[1]/div/img/@src')  # logo
        introduce_list = game_d.xpath(
            r'.//div[@class="txtArea"]/div[@class="txtCon"]/div[not(@class or @id)]/text()')  # introduction
        score_list = game_d.xpath(
            r'/html/body/div[1]/div[3]/div[1]/div[1]/div[1]/div/div[2]/div/p[2]/text()')  # score
        comment_count_list = game_d.xpath(
            r'/html/body/div[1]/div[3]/div[1]/div[1]/div[1]/div/div[2]/div/p[4]/text()')  # number of commenters
        fields = (name_list, logo_list, introduce_list, score_list, comment_count_list)
        if not all(fields):
            logging.debug("%s: page does not match the game-detail layout", self.name)
            return
        logo = self._absolute_url(logo_list[0])
        print(name_list[0], logo, introduce_list[0], score_list[0], comment_count_list[0])
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    # Thread lock (not acquired anywhere in this script).
    lock = threading.Lock()
    # 1. Queue of downloaded bodies; the worker classes read it as a module global.
    response_queue = Queue()
    # Becomes True once every producer has exited; consumers read it to decide
    # when to stop.
    flag = False
    start_time = time.time()
    # 2. Queue of URLs for the producers.
    page_queue = Queue()

    url = 'https://www.3839.com/top/hot.html'
    # Compiled once; the dots are escaped so they only match literal dots.
    game_id_re = re.compile(r'//www\.3839\.com/a/(.*)\.htm')
    try:
        r = requests.get(url=url, headers=HEADER, timeout=50)
    except requests.RequestException:
        logging.exception("failed to fetch the ranking page %s", url)
        raise SystemExit(1)
    # Decode the raw bytes directly; re-encoding r.text with r.encoding fails
    # when r.encoding is None.
    res = r.content.decode('utf-8', 'ignore')
    data = etree.HTML(res)
    game_url_list = data.xpath('/html/body/div[1]/div[4]/ul/li/a/@href')
    for i in game_url_list:
        page_queue.put('https:' + i)
        ids = game_id_re.findall(i)  # game id
        if not ids:
            # Without an id the comment URLs cannot be built; keep the detail
            # page and move on instead of aborting the whole run.
            logging.warning("no game id found in %r; skipping its comment pages", i)
            continue
        # Comment pages 1..99 of this game.
        for k in range(1, 100):
            xq_url = 'https://www.3839.com/cdn/comment/view_v2-ac-json-pid-1-fid-{}-p-{}-order-1-htmlsafe-1-urltype-1-audit-1.htm'.format(ids[0], k)
            page_queue.put(xq_url)

    # 3. Start the producer threads. Build a list of names: iterating over
    # str([...]) would yield one thread per character of the list's repr.
    pthread_list = []
    for name in [str(n) for n in range(1, 16)]:
        crawl = Productthread(name, page_queue)
        crawl.start()
        pthread_list.append(crawl)

    # 4. Start the consumer threads.
    cthread_list = []
    for name in [str(n) for n in range(1, 15)]:
        crawl = Consumerthread(name)
        crawl.start()
        cthread_list.append(crawl)

    # Block the main thread until all producers are done.
    for thread in pthread_list:
        thread.join()
    flag = True  # every producer thread has exited
    for thread in cthread_list:
        thread.join()
    end_time = time.time()
    logging.info("finished in %.1f s", end_time - start_time)
版权声明
本文为[圆滚滚的程序员]所创,转载请带上原文链接,感谢
https://blog.csdn.net/qq_39483957/article/details/106729700
边栏推荐
猜你喜欢

Addition, deletion, modification and query of MySQL table

基于pygame库编写的五子棋游戏

Delete and truncate

Algèbre linéaire chapitre 1 - déterminants
![[untitled] database - limit the number of returned rows](/img/20/9a143e6972f1ce2eed5a3d11c3a46d.png)
[untitled] database - limit the number of returned rows

Import of data
scikit-learn sklearn 0.18 官方文档中文版

SQL injection

Create binary tree

Installation and usage skills of idea
随机推荐
Generation of verification code
Best practices for MySQL storage time
斯坦福机器学习课程汇总
Detection technology and principle
C language file operation
[leetcode 228] summary interval
GDAL+OGR学习
Gesture recognition research
How to grow at work
11.a==b?
2. Devops sonar installation
IO multiplexing of 09 redis
自动控制原理知识点整合归纳(韩敏版)
[leetcode169] most elements
Algèbre linéaire chapitre 2 - matrice et son fonctionnement
scikit-learn sklearn 0.18 官方文档中文版
Plane semi intersecting plate
Protected (members modified by protected are visible to this package and its subclasses)
Optional best practices
Kalman filter and inertial integrated navigation