当前位置:网站首页>Crawling mobile game website game details and comments (MQ + multithreading)
Crawling mobile game website game details and comments (MQ + multithreading)
2022-04-23 18:00:00 【Round programmer】
This article is intended for learning and exchange only. Do not use it for any other purpose; otherwise you bear the consequences yourself.
Environment: Linux + PyCharm + Anaconda
import json
import logging
import random
import re
import threading
import time
from queue import Empty, Queue

import requests
from lxml import etree
# Pool of browser User-Agent strings used to disguise the crawler's requests.
# NOTE: two entries appear twice in the list, so random.choice() is slightly
# biased towards them.
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
]

# Default HTTP request headers for the crawler.
# NOTE: random.choice() runs once, when this statement is executed, so this
# dict carries a single fixed User-Agent for as long as the module is loaded.
HEADER = {
    'User-Agent': random.choice(USER_AGENTS),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip, deflate',
}
class Productthread(threading.Thread):
    """Producer thread: download queued URLs and hand the bodies to consumers.

    Each instance repeatedly takes one URL from ``page_queue``, fetches it with
    ``requests`` and puts the decoded response body on the module-level
    ``response_queue``.  The thread ends as soon as ``page_queue`` is empty.

    Args:
        name: Thread name (used in log messages).
        page_queue: ``queue.Queue`` holding the URLs to download.
    """

    def __init__(self, name, page_queue):
        super().__init__()
        self.name = name
        self.q = page_queue

    def run(self):
        """Fetch URLs until the task queue has been drained."""
        # Loop so that one thread handles many tasks instead of quitting
        # after the first one.
        while True:
            # A non-blocking get() is the only race-free "any work left?" test
            # when several producers share the queue: empty() may already be
            # stale by the time get() is called.
            try:
                url = self.q.get(block=False)
            except Empty:
                break

            # HEADER carries one User-Agent picked at import time; draw a new
            # one per request so the USER_AGENTS pool is actually rotated.
            headers = dict(HEADER)
            headers['User-Agent'] = random.choice(USER_AGENTS)

            try:
                reply = requests.get(url=url, headers=headers, timeout=50)
            except requests.RequestException as exc:
                # Best effort: one failed download must not stop the crawl,
                # but it must not vanish silently either.
                logging.warning("%s: could not fetch %s (%s)", self.name, url, exc)
                continue

            # Decode the raw bytes directly.  Re-encoding ``reply.text`` with
            # ``reply.encoding`` fails when the encoding is unknown (None) or
            # when the decoded text contains replacement characters.
            response_queue.put(reply.content.decode('utf-8', 'ignore'))
# consumer: parses the downloaded pages
class Consumerthread(threading.Thread):
    """Consumer thread: parse downloaded payloads and print the extracted rows.

    Payloads are read from the module-level ``response_queue``.  A payload that
    starts with ``{`` is treated as a JSON comment page; anything else is
    parsed as the HTML of a game detail page.  The thread stops once the queue
    stays empty after the module-level ``flag`` has been set to a true value
    (the main script sets it when every producer has finished).

    Args:
        name: Thread name (used in log messages).
    """

    # Seconds to wait for a payload before re-checking ``flag``.
    POLL_SECONDS = 1

    def __init__(self, name):
        super().__init__()
        self.name = name

    def run(self):
        """Process payloads until the producers are done and the queue is empty."""
        while True:
            # Wait with a timeout: an unbounded get() would block forever if
            # the producers finish while this thread is already waiting on an
            # empty queue, and the program would never exit.
            try:
                response = response_queue.get(timeout=self.POLL_SECONDS)
            except Empty:
                if flag:
                    break
                continue

            try:
                if response.startswith('{'):
                    self._handle_comments(response)
                else:
                    self._handle_game_page(response)
            except Exception:
                # Worker boundary: a malformed page must not kill the thread,
                # but the failure is reported instead of being swallowed.
                logging.exception("%s: could not parse a response", self.name)

    @staticmethod
    def _first(values, default=''):
        """Return the first element of *values*, or *default* when it is empty."""
        return values[0] if values else default

    @staticmethod
    def _normalize_logo(src):
        """Return *src* unchanged if it already starts with 'http', else prefix 'https:'."""
        if src.startswith('http'):
            return src
        return 'https:' + src

    @staticmethod
    def _comment_row(item):
        """Build the printable tuple for one comment dict of a JSON comment page."""
        posted = time.strftime("%Y--%m--%d %H:%M:%S", time.localtime(int(item['timeu'])))
        return (
            item['id'],
            item['username'],
            item['uid'],
            "https:" + item['avatar'],
            posted,
            item['comment'],
            item['good_num'],
            item['num'],
        )

    def _handle_comments(self, payload):
        """Print one line per comment found under the 'content' key of *payload*."""
        page = json.loads(payload)
        # A missing or empty 'content' simply means there is nothing to print.
        for item in page.get('content') or []:
            print(*self._comment_row(item))

    def _handle_game_page(self, html):
        """Print name, logo URL, introduction, score and comment count of a game page."""
        page = etree.HTML(html)
        names = page.xpath(
            r"/html/body/div[1]/div[3]/div[1]/div[1]/div[1]/div/div[1]/h1/text()")  # name
        if not names:
            # Not a game detail page (for example an error page): skip it.
            logging.info("%s: no game title found, page skipped", self.name)
            return
        logos = page.xpath(
            r'/html/body/div[1]/div[3]/div[1]/div[1]/div[1]/div/img/@src')  # logo image
        intros = page.xpath(
            r'.//div[@class="txtArea"]/div[@class="txtCon"]/div[not(@class or @id)]/text()')  # introduction
        scores = page.xpath(
            r'/html/body/div[1]/div[3]/div[1]/div[1]/div[1]/div/div[2]/div/p[2]/text()')  # score
        comment_counts = page.xpath(
            r'/html/body/div[1]/div[3]/div[1]/div[1]/div[1]/div/div[2]/div/p[4]/text()')  # number of comments
        logo = self._normalize_logo(logos[0]) if logos else ''
        # The game id is not part of the payload, so it is not printed.  The
        # former ``id_list[0]`` here referred to a local that only the comment
        # branch assigned, so it was either unbound or held a comment id.
        print(names[0], logo, self._first(intros), self._first(scores),
              self._first(comment_counts))
# Captures the game id from a detail-page href of the form
# "//www.3839.com/a/<id>.htm".
GAME_ID_PATTERN = re.compile(r'//www\.3839\.com/a/(.*)\.htm')

# Comment-page URL; the placeholders are the game id and the page number.
COMMENT_URL_TEMPLATE = (
    'https://www.3839.com/cdn/comment/view_v2-ac-json-pid-1-fid-{}-p-{}'
    '-order-1-htmlsafe-1-urltype-1-audit-1.htm'
)


def worker_names(count):
    """Return the thread names '1' .. str(count), one name per worker."""
    return [str(number) for number in range(1, count + 1)]


def build_task_urls(game_hrefs, pages=range(1, 100)):
    """Expand ranking-page hrefs into the ordered list of URLs to download.

    For every href the detail-page URL comes first, followed by one comment
    URL per entry of *pages*.  An href without a recognisable game id still
    yields its detail-page URL, but no comment URLs.

    Args:
        game_hrefs: Scheme-relative hrefs such as '//www.3839.com/a/123.htm'.
        pages: Comment page numbers to request for each game.

    Returns:
        A list of absolute URLs.
    """
    urls = []
    for href in game_hrefs:
        urls.append('https:' + href)
        match = GAME_ID_PATTERN.search(href)
        if match is None:
            logging.warning("no game id found in %r; comments skipped", href)
            continue
        game_id = match.group(1)
        urls.extend(COMMENT_URL_TEMPLATE.format(game_id, page) for page in pages)
    return urls


if __name__ == '__main__':
    # Thread lock (kept as a module-level name; nothing in this script acquires it).
    lock = threading.Lock()
    # 1. Consumer queue: downloaded bodies waiting to be parsed.
    response_queue = Queue()
    # Becomes True once every producer thread has finished.
    flag = False
    start_time = time.time()
    # 2. Producer task queue: URLs waiting to be downloaded.
    page_queue = Queue()

    url = 'https://www.3839.com/top/hot.html'
    try:
        r = requests.get(url=url, headers=HEADER, timeout=50)
    except requests.RequestException as exc:
        # Without the ranking page there is nothing to crawl; say so instead
        # of exiting silently.
        raise SystemExit('could not download {}: {}'.format(url, exc))
    data = etree.HTML(r.content.decode('utf-8', 'ignore'))
    game_url_list = data.xpath('/html/body/div[1]/div[4]/ul/li/a/@href')
    for task_url in build_task_urls(game_url_list):
        page_queue.put(task_url)

    # 3. Start the producer threads.  Iterate over a list of names: iterating
    #    over str([...]) would start one thread per *character* of the string.
    pthread_list = []
    for name in worker_names(15):
        crawl = Productthread(name, page_queue)
        crawl.start()
        pthread_list.append(crawl)

    # 4. Start the consumer threads.
    cthread_list = []
    for name in worker_names(14):
        crawl = Consumerthread(name)
        crawl.start()
        cthread_list.append(crawl)

    # Block the main thread until all producers are done ...
    for thread in pthread_list:
        thread.join()
    flag = True  # ... then tell the consumers that no more work will arrive.
    for thread in cthread_list:
        thread.join()
    end_time = time.time()
版权声明
本文为[Round programmer]所创,转载请带上原文链接,感谢
https://yzsam.com/2022/04/202204230545315975.html
边栏推荐
- 云原生虚拟化:基于 Kubevirt 构建边缘计算实例
- Error in created hook: "referenceerror:" promise "undefined“
- YOLOv4剪枝【附代码】
- Click Cancel to return to the previous page and modify the parameter value of the previous page, let pages = getcurrentpages() let prevpage = pages [pages. Length - 2] / / the data of the previous pag
- Element calculation distance and event object
- Client example analysis of easymodbustcp
- re正則表達式
- JS implementation private attribute
- Thirteen documents in software engineering
- I/O多路复用及其相关详解
猜你喜欢
Data stream encryption and decryption of C
云原生虚拟化:基于 Kubevirt 构建边缘计算实例
Random number generation of C #
cv_ Solution of mismatch between bridge and opencv
2022 judgment questions and answers for operation of refrigeration and air conditioning equipment
The ultimate experience, the audio and video technology behind the tiktok
Go的Gin框架学习
Auto. JS custom dialog box
[appium] write scripts by designing Keyword Driven files
re正则表达式
随机推荐
The method of changing a value in the array and a value in the object of wechat applet
Solving the problem of displaying too many unique values in ArcGIS partition statistics failed
Client example analysis of easymodbustcp
MySQL_01_简单数据检索
Error in created hook: "referenceerror:" promise "undefined“
Classification of cifar100 data set based on convolutional neural network
C1小笔记【任务训练篇一】
On the method of outputting the complete name of typeID from GCC
How to read literature
Re regular expression
Go language JSON package usage
C1小笔记【任务训练篇二】
云原生虚拟化:基于 Kubevirt 构建边缘计算实例
2022 Jiangxi Photovoltaic Exhibition, China distributed Photovoltaic Exhibition, Nanchang solar energy utilization Exhibition
Calculation of fishing net road density
On the problem of V-IF display and hiding
开源按键组件Multi_Button的使用,含测试工程
JS forms the items with the same name in the array object into the same array according to the name
Land cover / use data product download
Implementation of object detection case based on SSD