当前位置：网站首页>Literature retrieval operation code

Literature retrieval operation code

2022-08-09 08:09:00 【happy learning】

作业二

#检索词
import os
string=[]
#获得字符串
for i in range(1,6):
    filename="D:/Documents/Desktop/文献检索/第二次作业/doc/doc_"+str(i)+".txt"
    with open(filename,'r',encoding='utf-8') as files:
        text = files.read()
    string.append(text)
    
    
    

#Split the sentence into a series of words,Words are represented by lists
def split_list(p):
# A variable needs to be defined to record the start of the word
    a="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    words = []  # 建立一个空列表
    index = 0   # 遍历所有的字符
    start = 0   # Record where each word starts
    while index < len(p):   # 当index小于p的长度
        start = index       # start来记录位置
        while p[index] != " " and p[index] in a:   # 若不是空格,点号,逗号
            index += 1   # index加一
            if index == len(p):  # 若遍历完成
                break   # 结束
        words.append(p[start:index])
        if index == len(p):
            break
        while p[index] == " " or p[index] not in a:
            index += 1
            if index ==len(p):
                break     
    return words

#Form words first and save all words
wordlist=[[],[],[],[],[]]
for i in range(0,5):
    wordlist[i]=split_list(string[i])

#First count different words and construct word vectors
wordvector=[]
for i in range(0,5):
    for word in wordlist[i]:
        if word not in wordvector:
            wordvector.append(word.lower())

length=len(wordvector)        
#Then build five vectors,并生成向量
def dicvector(wl):
    dl=[]
    for i in range(0,length):
        dl.append(0)
    j=0
    for w in wl:
        count=wordvector.index(w.lower())
        if dl[count]==1:
            continue
        else:
            dl[count]=1
    return dl
    
dic1=dicvector(wordlist[0])
dic2=dicvector(wordlist[1])
dic3=dicvector(wordlist[2])
dic4=dicvector(wordlist[3])
dic5=dicvector(wordlist[4])

print("Please enter an index value:\n")
yourword='in'

yourwordvector=[]
for i in range(0,length):
    yourwordvector.append(0)
c=wordvector.index(yourword)
yourwordvector[c]=1

#find inner product and modulo
def mo(list):
    he=0
    for i in list:
        if i==1:
            he=he+1
    return he
def neiji(list1,list2):
    he=0
    ll=len(list1)
    for i in range(0,ll):
        he=he+list1[i]*list2[i]
    return he
result1=neiji(yourwordvector,dic1)/(mo(dic1))
result2=neiji(yourwordvector,dic2)/(mo(dic2))
result3=neiji(yourwordvector,dic3)/(mo(dic3))
result4=neiji(yourwordvector,dic4)/(mo(dic4))
result5=neiji(yourwordvector,dic5)/(mo(dic5))
print(result1)
print(result2)
print(result3)
print(result4)
print(result5)

作业三

#检索词
#df_tf
import os
import pickle
import math
from numpy import power
string=[]
N=5
#获得字符串
for i in range(1,6):
    filename="D:/Documents/Desktop/文献检索/第二次作业/doc/doc_"+str(i)+".txt"
    with open(filename,'r',encoding='utf-8') as files:
        text = files.read()
    string.append(text)
    
    
    

#Split the sentence into a series of words,Words are represented by lists
def split_list(p):
# A variable needs to be defined to record the start of the word
    a="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    words = []  # 建立一个空列表
    index = 0   # 遍历所有的字符
    start = 0   # Record where each word starts
    while index < len(p):   # 当index小于p的长度
        start = index       # start来记录位置
        while p[index] != " " and p[index] in a:   # 若不是空格,点号,逗号
            index += 1   # index加一
            if index == len(p):  # 若遍历完成
                break   # 结束
        words.append(p[start:index])
        if index == len(p):
            break
        while p[index] == " " or p[index] not in a:
            index += 1
            if index ==len(p):
                break     
    return words
#Alter the array string
def listtolower(li):
    j=0
    for i in li:
        li[j]=i.lower()
        j=j+1

#Form words first and save all words
wordlist=[[],[],[],[],[]]
for i in range(0,5):
    wordlist[i]=split_list(string[i])
    listtolower(wordlist[i])


#First count different words and construct word vectors
wordvector=[]
for i in range(0,5):
    for word in wordlist[i]:
        if word not in wordvector:
            wordvector.append(word.lower())

length=len(wordvector)
#数组中元素的个数
def listmoment(a,b):
    cc=0
    for i in b:
        if i==a:
            cc=cc+1
    return cc

#wordl中各字符数量
tf=[]
for i in wordvector:
    ccc=listmoment(i,wordlist[0])+listmoment(i,wordlist[1])+listmoment(i,wordlist[2])+listmoment(i,wordlist[3])+listmoment(i,wordlist[4])
    tf.append(ccc)

#wordin The number of times it appears in the paper
wordin=[]
for i in wordvector:
    cccc=0
    if i in wordlist[0]:
        cccc=cccc+1
    if i in wordlist[1]:
        cccc=cccc+1
    if i in wordlist[2]:
        cccc=cccc+1
    if i in wordlist[3]:
        cccc=cccc+1
    if i in wordlist[4]:
        cccc=cccc+1
    wordin.append(cccc)
        
#计算idft

idft=[]
for i in range(0,length):
    aaa=math.log(N/wordin[i])
    idft.append(aaa)
#计算tf_idf
tf_idf=[]
for i in range(0,length):
    aa=tf[i]*idft[i]
    tf_idf.append(aa)

#Then start calculating the inner product

yourword='across'

yourwordvector=[]
for i in range(0,length):
    yourwordvector.append(0)
c=wordvector.index(yourword)
yourwordvector[c]=idft[c]

#find inner product and modulo
def mo(list1):
    he=0
    for i in list1:dw
        he=he+power(i,2)
    return power(he,1/2)
def neiji(list1,list2):
    he=0
    ll=len(list1)
    for i in range(0,ll):
        he=he+list1[i]*list2[i]
    return he


result=neiji(yourwordvector,tf_idf)/(mo(yourwordvector)*mo(tf_idf))

print(result)

#filename1="D:/Documents/Desktop/文献检索/第二次作业/doc/result.txt"
#with open(filename1,mode='w') as file1:
# file1.write(str(tf_idf))

原网站

版权声明
本文为[happy learning]所创，转载请带上原文链接，感谢
https://yzsam.com/2022/221/202208090759231702.html

当前位置：网站首页>Literature retrieval operation code

Literature retrieval operation code

边栏推荐

猜你喜欢

随机推荐