当前位置：网站首页>K-means clustering based on word2vec

K-means clustering based on word2vec

2022-04-21 14:04:00 【ddy-ddy】

1. Use word2vec to convert each word of the txt text into a word vector
2. Use PCA to reduce the 300-dimensional word vectors to 2 dimensions
3. Use the 2-dimensional data as the input of k-means clustering

text.txt: the training text (preferably in English; for Chinese text, segment it first with the jieba library)
word_model.txt: create this as an empty text file
data.csv: create this as an empty csv file

#1. Replace the punctuation of the text with a space 
import re
import os
# Punctuation marks that are replaced by a space before training.
punctuation = [',', '?', '.', '!', '*', '(', ')', '“', '”', ':', '"', '`', '\'']

# text.txt is the training text (e.g. an English novel). It is read as UTF-8
# because the word2vec step opens the same file with encoding='utf-8'.
with open('text.txt', 'r', encoding='utf-8') as f:
    result = f.read()

for mark in punctuation:
    result = result.replace(mark, ' ')

# Rewrite the file only after the read handle has been closed, so the same
# path is never open for reading and writing at the same time.
with open('text.txt', 'w', encoding='utf-8') as w:
    w.write(result)



##2. Use word2vec to get the word vectors
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
def wordsCluster(text, vectorSize):
    """Train a word2vec model on a text file and save its word vectors.

    Args:
        text: Local path of the training text; it is handed to gensim's
            ``LineSentence`` as the corpus.
        vectorSize: Size of each word vector (number of dimensions).

    Returns:
        None. The vectors are written in the textual word2vec format to
        ``word_model.txt`` in the current working directory.
    """
    import inspect

    # gensim 4 renamed Word2Vec's ``size`` argument to ``vector_size``;
    # pass whichever keyword the installed version accepts.
    accepted = inspect.signature(Word2Vec.__init__).parameters
    size_keyword = 'vector_size' if 'vector_size' in accepted else 'size'

    model = Word2Vec(
        LineSentence(text),
        window=5,
        min_count=1,  # keep every word, even those that occur only once
        workers=4,
        **{size_keyword: vectorSize}
    )
    # Save the word vectors as plain text in word_model.txt.
    model.wv.save_word2vec_format('word_model.txt', binary=False)


wordsCluster('text.txt', 300)



##3. Convert the txt file containing the word vectors into a csv file
# Blank out the first line of word_model.txt (presumably the word2vec header
# with vocabulary size and dimension) so that only vector lines carry data.
# NOTE(review): assumes word_model.txt is UTF-8, as written by gensim's
# save_word2vec_format - confirm.
with open("word_model.txt", "r", encoding="utf-8") as f:
    new = f.readlines()
new[0] = '\n'

with open("word_model.txt", "w", encoding="utf-8") as f:
    f.writelines(new)

import csv
# data.csv stores one row per line of word_model.txt: the word followed by its
# vector components. The blanked first line becomes an empty first row.
# data.csv keeps the platform default encoding, which is how it is read back.
with open('data.csv', 'w', newline='') as csvfile, \
        open('word_model.txt', encoding='utf-8') as data:
    writer = csv.writer(csvfile)
    for each_line in data:
        # Fields in word_model.txt are whitespace-separated.
        writer.writerow(each_line.split())

##4. Use PCA to reduce the 300-dimensional data to 2 dimensions
# coding=utf-8
import numpy as np
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
import csv

l = []      # vector components of each word (as read from the csv)
words = []  # the word of each row, index-aligned with l
with open('data.csv', 'r', newline='') as fd:
    reader = csv.reader(fd)
    # The first row of data.csv is the blank row left where the first line of
    # word_model.txt was emptied; skip it.
    next(reader, None)
    for row in reader:
        if not row:
            # Ignore stray blank rows instead of adding an empty vector.
            continue
        words.append(row[0])
        l.append(row[1:])

# The csv fields are strings; convert them to floats explicitly so PCA
# receives numeric data.
X = np.array(l, dtype=float)
pca = PCA(n_components=2)     # reduce to 2 dimensions
# fit_transform both fits the model and projects X, so no separate fit() call
# is needed. newX holds the reduced data, one row per word.
newX = pca.fit_transform(X)


##5. Build a word-vector dictionary, train k-means on the vectors, and get the cluster assignments
# Map each word to its reduced vector. A dedicated name is used so the builtin
# ``dict`` is not shadowed for the rest of the script.
word_vectors = {}
for word_, vector in zip(words, newX):
    word_vectors[word_] = vector

# Print every word followed by its vector, in the order of ``words``.
for word_ in words:
    print(word_+':',end='')
    print(word_vectors[word_])

from sklearn.cluster import KMeans
import numpy as np

X = np.array(newX)
# Number of clusters. The message printed below says "five"; update it if this
# value is changed.
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)

print(" The coordinates of the five central words ：")
print(kmeans.cluster_centers_)

# clusters[k] collects the words whose k-means label is k, in the order of
# ``words``.
clusters = [[] for _ in range(n_clusters)]
for word, label in zip(words, kmeans.labels_):
    clusters[label].append(word)

for members in clusters:
    if not members:
        # An empty cluster has no first word to show; skip it rather than
        # failing with an IndexError.
        continue
    # The first word assigned to the cluster is used as its display keyword.
    print(" And keywords "+members[0]+" Related words are ：",end='')
    print(members)

## Visualize the data with a scatter chart 
# Split the reduced points into x and y coordinate lists and draw them as a
# scatter chart.
f1 = [point[0] for point in newX]
f2 = [point[1] for point in newX]
plt.scatter(f1, f2, c='blue', s=6)
plt.show()

The test results Insert picture description here

版权声明
本文为[ddy-ddy]所创，转载请带上原文链接，感谢
https://yzsam.com/2022/04/202204211351090557.html

当前位置：网站首页>K-means clustering based on word2vec

K-means clustering based on word2vec

边栏推荐

猜你喜欢

随机推荐