当前位置:网站首页>Crawling mobile game website game details and comments (MQ + multithreading)
Crawling mobile game website game details and comments (MQ + multithreading)
2022-04-23 18:00:00 【Round programmer】
This article is intended for learning and exchange only. Do not use it for any other purpose; otherwise you bear the consequences yourself.
Environment: Linux + PyCharm + Anaconda
import json
import logging
import random
import re
import threading
import time
from queue import Empty, Queue

import requests
from lxml import etree
# Pool of browser User-Agent strings; HEADER below draws one entry from it
# with random.choice.
USER_AGENTS = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
]
# Base HTTP request headers passed to requests.get.
# NOTE: random.choice is evaluated once, when this dict is built, so the
# User-Agent stored here does not change between uses of HEADER.
HEADER = {
'User-Agent': random.choice(USER_AGENTS),
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Connection': 'keep-alive',
'Accept-Encoding': 'gzip, deflate',
}
class Productthread(threading.Thread):
    """Producer thread: download queued URLs and pass the bodies on.

    Each instance repeatedly takes a URL from ``page_queue``, fetches it
    with ``requests.get`` and puts the decoded body on the module-level
    ``response_queue``. The thread ends once ``page_queue`` is empty.

    Args:
        name: Thread name, stored on ``self.name``.
        page_queue: Queue of URL strings to download.
    """

    def __init__(self, name, page_queue):
        super().__init__()
        self.name = name
        self.q = page_queue

    def run(self):
        """Drain the URL queue, publishing one response body per URL.

        A URL that cannot be fetched or decoded is reported through
        ``logging`` and skipped; it does not stop the thread.
        """
        log = logging.getLogger(__name__)
        while True:
            # A non-blocking get is the only race-free emptiness test when
            # several producers share the queue.
            try:
                key = self.q.get(block=False)
            except Empty:
                return
            try:
                # Pick a fresh User-Agent per request; HEADER itself holds
                # a single value chosen when the dict was built.
                headers = dict(HEADER)
                headers['User-Agent'] = random.choice(USER_AGENTS)
                game_r = requests.get(url=key, headers=headers, timeout=50)
                # Decode the raw bytes directly: re-encoding ``text`` with
                # ``encoding`` fails when the server declares no charset.
                game_res = game_r.content.decode('utf-8', 'ignore')
            except Exception as exc:  # keep the worker alive on any failure
                log.warning("%s: failed to fetch %s: %s", self.name, key, exc)
                continue
            response_queue.put(game_res)
# consumer : Parsing the page
class Consumerthread(threading.Thread):
    """Consumer thread: parse downloaded bodies and print the fields.

    Bodies are read from the module-level ``response_queue``. A body that
    starts with ``{`` is treated as a JSON comment page; anything else is
    parsed as a game-details HTML page. The thread ends when the
    module-level ``flag`` is true and the queue is empty.

    Args:
        name: Thread name, stored on ``self.name``.
    """

    # Seconds to wait for a body before re-checking the shutdown condition.
    POLL_SECONDS = 1

    def __init__(self, name):
        super().__init__()
        self.name = name

    def run(self):
        """Process bodies until producers are done and the queue is empty.

        A body that cannot be parsed is reported through ``logging`` and
        skipped; it does not stop the thread.
        """
        log = logging.getLogger(__name__)
        while True:
            if flag and response_queue.empty():
                break
            # Wait with a timeout: an unbounded get() would block forever
            # if the queue drains before ``flag`` is set.
            try:
                response = response_queue.get(timeout=self.POLL_SECONDS)
            except Empty:
                continue
            if not response:
                continue
            try:
                if response.startswith('{'):
                    self._print_comments(response)
                else:
                    self._print_game_details(response)
            except Exception as exc:  # keep the worker alive on bad input
                log.warning("%s: could not parse response: %s", self.name, exc)

    @staticmethod
    def _print_comments(response):
        """Print one line per comment found in a JSON comment page."""
        payload = json.loads(response)
        # A page without 'content' simply has no comments to print.
        for j in payload.get('content') or []:
            avatar = "https:" + j['avatar']
            posted = time.strftime(
                "%Y--%m--%d %H:%M:%S", time.localtime(int(j['timeu'])))
            print(j['id'], j['username'], j['uid'], avatar, posted,
                  j['comment'], j['good_num'], j['num'])

    @staticmethod
    def _print_game_details(response):
        """Print name, logo, introduction, score and comment count."""
        game_d = etree.HTML(response)
        name_list = game_d.xpath(
            r"/html/body/div[1]/div[3]/div[1]/div[1]/div[1]/div/div[1]/h1/text()")
        logo_list = game_d.xpath(
            r'/html/body/div[1]/div[3]/div[1]/div[1]/div[1]/div/img/@src')
        introduce_list = game_d.xpath(
            r'.//div[@class="txtArea"]/div[@class="txtCon"]/div[not(@class or @id)]/text()')
        score_list = game_d.xpath(
            r'/html/body/div[1]/div[3]/div[1]/div[1]/div[1]/div/div[2]/div/p[2]/text()')
        comment_count_list = game_d.xpath(
            r'/html/body/div[1]/div[3]/div[1]/div[1]/div[1]/div/div[2]/div/p[4]/text()')
        if not (name_list and logo_list and introduce_list
                and score_list and comment_count_list):
            raise ValueError("game details page is missing expected fields")
        # Only prepend the scheme to protocol-relative URLs.
        if logo_list[0].startswith('http'):
            logo = logo_list[0]
        else:
            logo = 'https:' + logo_list[0]
        # NOTE: the game id is not printed. The body alone does not carry
        # it, and the previous code read an unrelated (or unset) local.
        print(name_list[0], logo, introduce_list[0], score_list[0],
              comment_count_list[0])
if __name__ == '__main__':
    # Script entry point: collect the game URLs from the hot-list page,
    # queue each details page plus its comment pages, then run producer
    # and consumer threads until everything is processed.

    # 1. Consumer queue and shutdown flag; the worker classes read both
    #    as module-level globals.
    response_queue = Queue()
    flag = False
    start_time = time.time()

    # 2. Producer task queue, filled from the hot-list page.
    page_queue = Queue()
    url = 'https://www.3839.com/top/hot.html'
    try:
        r = requests.get(url=url, headers=HEADER, timeout=50)
    except requests.RequestException as exc:
        # Nothing can be crawled without the index page; say so rather
        # than exiting silently.
        raise SystemExit('failed to fetch {}: {}'.format(url, exc))
    # Decode the raw bytes directly: re-encoding ``text`` with ``encoding``
    # fails when the server declares no charset.
    res = r.content.decode('utf-8', 'ignore')
    data = etree.HTML(res)
    game_url_list = data.xpath('/html/body/div[1]/div[4]/ul/li/a/@href')
    id_pattern = re.compile(r'//www\.3839\.com/a/(.*)\.htm')
    for i in game_url_list:
        game_url = 'https:' + i
        page_queue.put(game_url)
        id_list = id_pattern.findall(i)  # game id taken from the href
        if not id_list:
            # Unexpected href shape: keep the details page, skip comments.
            continue
        for k in range(1, 100):
            xq_url = 'https://www.3839.com/cdn/comment/view_v2-ac-json-pid-1-fid-{}-p-{}-order-1-htmlsafe-1-urltype-1-audit-1.htm'.format(id_list[0],k)
            page_queue.put(xq_url)

    # 3. Start the producer threads, one per name ("1" .. "15").
    crawl_name = [str(n) for n in range(1, 16)]
    pthread_list = []
    for name in crawl_name:
        crawl = Productthread(name, page_queue)
        crawl.start()
        pthread_list.append(crawl)

    # 4. Start the consumer threads, one per name ("1" .. "14").
    consumer_name = [str(n) for n in range(1, 15)]
    cthread_list = []
    for name in consumer_name:
        crawl = Consumerthread(name)
        crawl.start()
        cthread_list.append(crawl)

    # Block the main thread until every producer has finished.
    for thread in pthread_list:
        thread.join()
    flag = True  # tells consumers that no more responses will arrive
    for thread in cthread_list:
        thread.join()
    end_time = time.time()
    print('elapsed: {:.2f}s'.format(end_time - start_time))
版权声明
本文为[Round programmer]所创,转载请带上原文链接,感谢
https://yzsam.com/2022/04/202204230545315975.html
边栏推荐
- 2022 Jiangxi energy storage technology exhibition, China Battery exhibition, power battery exhibition and fuel cell Exhibition
- Land cover / use data product download
- Transfer learning of five categories of pictures based on VGg
- Scikit learn sklearn 0.18 official document Chinese version
- positioner
- _ FindText error
- Clion installation tutorial
- Examination question bank and online simulation examination of the third batch (main person in charge) of special operation certificate of safety officer a certificate in Guangdong Province in 2022
- Oil monkey website address
- Client example analysis of easymodbustcp
猜你喜欢
C network related operations
Scikit learn sklearn 0.18 official document Chinese version
Auto.js 自定义对话框
2022江西光伏展,中國分布式光伏展會,南昌太陽能利用展
Welcome to the markdown editor
Land cover / use data product download
Clion installation tutorial
[UDS unified diagnostic service] (Supplement) v. detailed explanation of ECU bootloader development points (1)
2022 Jiangxi Photovoltaic Exhibition, China distributed Photovoltaic Exhibition, Nanchang solar energy utilization Exhibition
Using files to save data (C language)
随机推荐
.105Location
Use of list - addition, deletion, modification and query
Svn simple operation command
.105Location
C#的随机数生成
[UDS unified diagnostic service] IV. typical diagnostic service (4) - online programming function unit (0x34-0x38)
Fashion classification case based on keras
Go's gin framework learning
Selenium + phantom JS crack sliding verification 2
On the problem of V-IF display and hiding
Scikit learn sklearn 0.18 official document Chinese version
Remember using Ali Font Icon Library for the first time
Client example analysis of easymodbustcp
Summary of floating point double precision, single precision and half precision knowledge
Generate verification code
2022江西光伏展,中国分布式光伏展会,南昌太阳能利用展
String function in MySQL
20222 return to the workplace
JS high frequency interview questions
ROS package NMEA_ navsat_ Driver reads GPS and Beidou Positioning Information Notes