当前位置:网站首页>Literature retrieval operation code
Literature retrieval operation code
2022-08-09 08:09:00 【happy learning】
作业二
#检索词
import os
# Read the full text of the five documents doc_1.txt .. doc_5.txt into
# `string`, one entry per document.
string = []
for doc_no in range(1, 6):
    filename = "D:/Documents/Desktop/文献检索/第二次作业/doc/doc_" + str(doc_no) + ".txt"
    with open(filename, mode='r', encoding='utf-8') as files:
        string.append(files.read())
# Split a text into a list of words.
def split_list(p):
    """Split a text into words made only of ASCII letters.

    Every character that is not one of a-z / A-Z (spaces, digits,
    punctuation, line breaks, non-ASCII characters) acts as a separator.

    Args:
        p: The text to tokenise.

    Returns:
        The maximal runs of ASCII letters found in ``p``, in order of
        appearance and with their original case.  Text that starts with a
        separator does not produce a leading empty string, and text without
        any letter (including the empty string) produces an empty list.
    """
    a = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # Blank out every separator, then let str.split() collapse the runs of
    # blanks; unlike manual index scanning this can never emit an empty word.
    return "".join(ch if ch in a else " " for ch in p).split()
# Tokenise each of the five documents: wordlist[k] holds the words of string[k].
wordlist = [split_list(string[doc_no]) for doc_no in range(0, 5)]
# Build the vocabulary: every distinct lower-cased word, in order of first
# appearance across the documents.
# The word is lower-cased BEFORE the duplicate check.  Testing the raw word
# against the lower-cased vocabulary let any capitalised word (e.g. "The") be
# appended again on every occurrence.  dict.fromkeys de-duplicates in one
# pass while preserving insertion order.
wordvector = list(
    dict.fromkeys(word.lower() for words in wordlist for word in words)
)
length = len(wordvector)
# Then build a term-presence vector for each of the five documents.
def dicvector(wl):
    """Return the binary term-presence vector of one document.

    The vector has ``length`` entries, one per position of the module-level
    ``wordvector``.  For every word of ``wl`` the lower-cased word is looked
    up in ``wordvector`` and the entry at that position is set to 1; all
    other entries stay 0.

    Args:
        wl: The words of the document.

    Returns:
        A list of 0/1 integers of size ``length``.

    Raises:
        ValueError: If a lower-cased word is not present in ``wordvector``.
    """
    presence = [0] * length
    for token in wl:
        presence[wordvector.index(token.lower())] = 1
    return presence
# Term-presence vector of each of the five documents.
dic1, dic2, dic3, dic4, dic5 = [
    dicvector(wordlist[doc_no]) for doc_no in range(5)
]
print("Please enter an index value:\n")
# The query term is fixed here; nothing is read from the user.
yourword = 'in'
# One-hot query vector: 1 at the vocabulary position of the query term.
# list.index raises ValueError when the term is not in the vocabulary.
yourwordvector = [0] * length
c = wordvector.index(yourword)
yourwordvector[c] = 1
# Helpers for the score: count of 1-entries (mo) and inner product (neiji).
def mo(list):
    """Count the entries of a vector that are equal to 1.

    Args:
        list: An iterable of numbers (a 0/1 presence vector in this script).

    Returns:
        The number of entries that compare equal to 1.
    """
    return sum(1 for entry in list if entry == 1)
def neiji(list1, list2):
    """Return the inner product of two vectors.

    The product is taken over the positions of ``list1``; extra trailing
    entries of ``list2`` are ignored, and a shorter ``list2`` raises
    ``IndexError``.

    Args:
        list1: First vector.
        list2: Second vector, at least as long as ``list1``.

    Returns:
        The sum of ``list1[k] * list2[k]`` over all positions of ``list1``.
    """
    total = 0
    for pos, left in enumerate(list1):
        total = total + left * list2[pos]
    return total
# Score every document against the query: the inner product with the query
# vector divided by the number of 1-entries of the document vector.
result1, result2, result3, result4, result5 = [
    neiji(yourwordvector, doc_vector) / (mo(doc_vector))
    for doc_vector in (dic1, dic2, dic3, dic4, dic5)
]
# Print the five scores in document order, one per line.
for score in (result1, result2, result3, result4, result5):
    print(score)
作业三
#检索词
#df_tf
import os
import pickle
import math
from numpy import power
# Number of documents in the collection.
N = 5
# Read the full text of the five documents doc_1.txt .. doc_5.txt into
# `string`, one entry per document.
string = []
for doc_no in range(1, 6):
    filename = "D:/Documents/Desktop/文献检索/第二次作业/doc/doc_" + str(doc_no) + ".txt"
    with open(filename, mode='r', encoding='utf-8') as files:
        string.append(files.read())
# Split a text into a list of words.
def split_list(p):
    """Split a text into words made only of ASCII letters.

    Every character that is not one of a-z / A-Z (spaces, digits,
    punctuation, line breaks, non-ASCII characters) acts as a separator.

    Args:
        p: The text to tokenise.

    Returns:
        The maximal runs of ASCII letters found in ``p``, in order of
        appearance and with their original case.  Text that starts with a
        separator does not produce a leading empty string, and text without
        any letter (including the empty string) produces an empty list.
    """
    a = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # Blank out every separator, then let str.split() collapse the runs of
    # blanks; unlike manual index scanning this can never emit an empty word.
    return "".join(ch if ch in a else " " for ch in p).split()
# Lower-case every string of a list in place.
def listtolower(li):
    """Replace every string of ``li`` by its lower-cased form, in place.

    Args:
        li: A mutable sequence of strings; it is modified directly.

    Returns:
        None.
    """
    for pos, text in enumerate(li):
        li[pos] = text.lower()
# Tokenise each of the five documents and lower-case its words:
# wordlist[k] holds the lower-cased words of string[k].
wordlist = []
for doc_no in range(5):
    doc_words = split_list(string[doc_no])
    listtolower(doc_words)
    wordlist.append(doc_words)
# Build the vocabulary: every distinct lower-cased word, in order of first
# appearance across the documents.
# The word is lower-cased BEFORE the duplicate check, so the membership test
# and the stored value always agree (testing the raw word while storing the
# lower-cased one duplicates any word that is not already lower-case).
# dict.fromkeys de-duplicates in one pass while preserving insertion order,
# replacing the quadratic list-membership scan.
wordvector = list(
    dict.fromkeys(word.lower() for words in wordlist for word in words)
)
length = len(wordvector)
# Count how many times an element occurs in a list.
def listmoment(a, b):
    """Count how many entries of ``b`` are equal to ``a``.

    Args:
        a: The value to look for.
        b: An iterable of values.

    Returns:
        The number of entries of ``b`` that compare equal to ``a``.
    """
    return sum(1 for entry in b if entry == a)
# tf[k]: total number of occurrences of vocabulary word k, summed over the
# five documents.
tf = [
    sum(listmoment(term, wordlist[doc_no]) for doc_no in range(5))
    for term in wordvector
]
# wordin[k]: number of documents (out of the five) that contain vocabulary
# word k at least once.
wordin = [
    sum(1 for doc_no in range(5) if term in wordlist[doc_no])
    for term in wordvector
]
# idft[k]: inverse document frequency of vocabulary word k, i.e. the natural
# logarithm of N divided by the number of documents containing the word.
idft = [math.log(N / wordin[pos]) for pos in range(0, length)]
# tf_idf[k]: occurrence count of vocabulary word k multiplied by its idf.
tf_idf = [tf[pos] * idft[pos] for pos in range(0, length)]
# Build the query vector: all zeros except the position of the query term,
# which carries that term's idf weight.
# list.index raises ValueError when the term is not in the vocabulary.
yourword = 'across'
yourwordvector = [0] * length
c = wordvector.index(yourword)
yourwordvector[c] = idft[c]
# Helpers for the cosine score: Euclidean norm (mo) and inner product (neiji).
def mo(list1):
    """Return the Euclidean norm of a vector.

    Args:
        list1: An iterable of numbers.

    Returns:
        The square root of the sum of the squared entries, as computed by
        ``numpy.power``; 0.0 for an empty vector.
    """
    he = 0
    # The stray "dw" token that followed this loop header was removed; it
    # made the function invalid.
    for value in list1:
        he = he + power(value, 2)
    return power(he, 1/2)
def neiji(list1, list2):
    """Return the inner product of two vectors.

    The product is taken over the positions of ``list1``; extra trailing
    entries of ``list2`` are ignored, and a shorter ``list2`` raises
    ``IndexError``.

    Args:
        list1: First vector.
        list2: Second vector, at least as long as ``list1``.

    Returns:
        The sum of ``list1[k] * list2[k]`` over all positions of ``list1``,
        accumulated left to right.
    """
    total = 0
    for pos, left in enumerate(list1):
        total = total + left * list2[pos]
    return total
# Cosine similarity between the query vector and the tf-idf vector.
# A query word that occurs in all N documents has idf log(N/N) = 0, which
# makes the query vector all zeros and the denominator 0; report a
# similarity of 0.0 in that case instead of dividing by zero.
denominator = mo(yourwordvector) * mo(tf_idf)
if denominator != 0:
    result = neiji(yourwordvector, tf_idf) / denominator
else:
    result = 0.0
print(result)
#filename1="D:/Documents/Desktop/文献检索/第二次作业/doc/result.txt"
#with open(filename1,mode='w') as file1:
# file1.write(str(tf_idf))
边栏推荐
猜你喜欢
System Security and Application
EMQ X message server learning record - prepare for the subsequent completion
3D软件开发工具HOOPS全套产品开发介绍 | HOOPS Exchange、HOOPS Communicator
浅谈Endpoint
如何生成dll文件 采用VS2017生成dll文件(动态库文件)和lib文件(静态库文件)以C语言为例
VMware virtual machine cannot be connected to the Internet after forced shutdown
Redis(八)集群
基于appinventor与EasyDL物体检测API的物体检测app
权限(上)
SOLIDWORKS 2022新功能直播揭秘!速来围观!
随机推荐
C语言笔记 学习预处理 学习宏定义
监视文本框的输入
JS基础1
浅谈Endpoint
转换为onnx模型错误汇总
文件处理(IO)
C: print the diamond
2019 Nanchang Internet Competition Question C, Hello 2019
Jmeter连接Mysql和Mysql编码问题
一文搞懂 条件编译和预处理指令 #define、#undef、#ifdef、#ifndef、#if、#elif、#else、#endif、defined 详解
Cookie和Session详解
Shell--常用小工具(sort、uniq、tr、cut)
.net(二) 配置数据库
Unity 3D模型展示框架篇之资源打包、加载、热更(二)
.net(一)WebService创建
我这是来宣传一下
eTS UI开发学习
web基本概念
nvm安装以及管理多版本node教程
进程和计划任务