当前位置:网站首页>word frequency count

word frequency count

2022-04-23 18:00:00 Round programmer

# Import the required libraries
import re  # regular expressions
import collections  # Counter for word-frequency statistics
import numpy as np  # numerical array handling
import jieba  # jieba Chinese word segmentation
import wordcloud  # word cloud generation
from PIL import Image  # image loading and processing
import matplotlib.pyplot as plt  # plotting / image display

# Read the whole article into memory.
# The `with` block guarantees the file handle is released even when reading
# raises, which the previous open()/read()/close() sequence did not.
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm it matches how article.txt was saved.
with open('article.txt') as fn:
    string_data = fn.read()

# Text preprocessing: strip tabs, newlines and a handful of ASCII punctuation
# marks (. - : ; ) ( ? ") from the text before segmentation.
# The pattern is a raw string so that `\.`, `\)`, `\(` and `\?` reach the regex
# engine verbatim; in a normal string literal they are invalid escape
# sequences that Python warns about. `\t` and `\n` are still understood by the
# regex engine as tab and newline, so the matched characters are unchanged.
pattern = re.compile(r'\t|\n|\.|-|:|;|\)|\(|\?|"')
string_data = pattern.sub('', string_data)  # delete every match

# Text segmentation: split the cleaned text with jieba (cut_all=False, i.e.
# the non-exhaustive "precise" mode) and drop the tokens listed as stop words.
seg_list_exact = jieba.cut(string_data, cut_all=False)

# Custom stop-word list; tokens equal to any entry are discarded.
# NOTE(review): several entries (e.g. ' Of ', ' and ') are padded with spaces
# and look like machine-translated stand-ins for the original stop words, so
# they may never equal a jieba token -- verify against the intended list.
remove_words = [
    u' Of ', u',', u' and ', u' yes ', u' With ', u' about ', u' Yes ',
    u' etc. ', u' can ', u' all ', u'.', u' ', u'、', u' in ', u' stay ',
    u' 了 ', u' Usually ', u' If ', u' We ', u' need ',
]

# Build the lookup once so each membership test is a hash lookup instead of a
# linear scan of the list.
_remove_lookup = frozenset(remove_words)

# Keep every token that is not a stop word, preserving the original order.
object_list = [word for word in seg_list_exact if word not in _remove_lookup]

# Word frequency statistics
word_counts = collections.Counter()
word_counts.update(object_list)  # tally every retained token
word_counts_top10 = word_counts.most_common(10)  # the 10 most frequent tokens
print(word_counts_top10)  # print them as a quick sanity check

# Word cloud rendering
# The picture is used twice: as the cloud's mask and, further down, as the
# source of its color scheme.
mask_image = Image.open('wordcloud.jpg')
mask = np.array(mask_image)

cloud_options = {
    'font_path': 'C:/Windows/Fonts/simhei.ttf',  # font file used to draw the words
    'mask': mask,  # background / mask array
    'max_words': 200,  # upper bound on the number of words drawn
    'max_font_size': 100,  # largest font size allowed
}
wc = wordcloud.WordCloud(**cloud_options)

# Lay out the cloud from the token -> count mapping.
wc.generate_from_frequencies(word_counts)

# Recolor the words with a color function derived from the mask image.
image_colors = wordcloud.ImageColorGenerator(mask)
wc.recolor(color_func=image_colors)

# Show the result without axes.
plt.imshow(wc)
plt.axis('off')
plt.show()

版权声明
本文为[Round programmer]所创,转载请带上原文链接,感谢
https://yzsam.com/2022/04/202204230545315832.html