from docx import Document
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import collections
import uuid
import requests
import hashlib
import time
import json

interpunctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '_', '-', '—', '"', '“', '”', "'"]
stops = set(stopwords.words("english"))

docxs1 = [Document('./word版/' + str(year) + '年考研英语一真题.docx') for year in range(2018, 2021)]


def get_words(docxs):
    # Input: a list of Document objects. Output: a list of words with punctuation,
    # digits and symbols removed, and question instructions / headings skipped.
    words = []  # collected words
    for doc in docxs:  # iterate over every file
        for para in doc.paragraphs:  # iterate over every paragraph in the file
            if len(para.runs) >= 1 and len(para.text) >= 1:  # a genuine, non-empty paragraph
                if para.runs[0].font.bold or para.runs[0].font.italic:
                    continue  # bold or italic paragraphs are headings or instructions; skip them
                # s = re.sub(r'\[.*?\]', ' ', para.text)
                s = re.sub(r'\[[A-Z]+\]', ' ', para.text)  # strip bracketed option markers such as [A]
                s = re.sub(r'[0-9]+', ' ', s)
                s = re.sub(r'②|③|①|④|⑤|⑥|⑦|⑧|⑨|⑩|⑪|⑫|⑬|⑭|⑮|⑯|⑰|_|—|-|\.', ' ', s)
                s = re.sub(r'\[0-9a-z\]', ' ', s)
                # print(s)
                s_words = word_tokenize(s)  # tokenize
                # print(s_words)
                cutwords = [word for word in s_words if word not in interpunctuations]
                cutwords = [word.lower() for word in cutwords if word not in stops]  # drop stop words and punctuation
                if cutwords:
                    if cutwords[0] == 'read' or cutwords[0] == 'translate':
                        continue  # some reading/translation instructions are neither bold nor italic
                    words += cutwords
    return words


def get_wordnet_pos(tag):  # map Penn Treebank POS tags to WordNet POS constants
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''


def get_stems(words):  # lemmatize the words
    lem = WordNetLemmatizer()  # lemmatizer
    words_tag = nltk.pos_tag(words)
    words_final = []
    for couple in words_tag:
        if couple[0][0] < 'a' or couple[0][0] > 'z' or len(couple[0]) <= 1:
            continue  # drop non-word tokens and single characters
        if get_wordnet_pos(couple[1]):
            words_final.append(lem.lemmatize(couple[0], get_wordnet_pos(couple[1])))
        else:
            words_final.append(lem.lemmatize(couple[0]))
    return words_final


# App ID and secret key for the Youdao translation API (openapi.youdao.com)
appid = '****'
secretKey = '********'
myurl = 'https://openapi.youdao.com/api'
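Presumably the lemmas are then counted and the most frequent ones looked up through the Youdao API configured above (the requests/hashlib/uuid imports point that way); the counting step itself, as a minimal sketch:

words = get_words(docxs1)                 # raw tokens from the 2018-2020 papers
stems = get_stems(words)                  # lemmatized tokens
freq = collections.Counter(stems)         # word-frequency statistics
for word, count in freq.most_common(50):  # the 50 most frequent lemmas
    print(word, count)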
File "/usr/lib/python3.6/zipfile.py", line 1198, in _RealGetContents raise BadZipFile("File is not a zip file") zipfile.BadZipFile: File is not a zip file
Hugging Face embedding model: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
BERTopic visualization (Colab notebook): https://colab.research.google.com/drive/1W7aEdDPxC29jP99GGZphUlqjMFFVKtBC?usp=sharing#scrollTo=SmxBX5UIzl_o
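The links above point at the sentence-transformers/all-MiniLM-L6-v2 embedding model and a BERTopic visualization notebook. A minimal sketch of how the two fit together (here `docs` is assumed to be a list of English paragraph strings, e.g. the cleaned paragraphs from the exam papers):

from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
topic_model = BERTopic(embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(docs)  # docs: list of raw text strings
fig = topic_model.visualize_topics()             # interactive inter-topic distance map
fig.write_html('bertopic_topics.html')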
from gensim import corpora, models, similarities
import gensim
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def process_articles(text):
    text = text.lower()
    text = text.replace('【', '')
    text = text.replace('】', '')
    text = text.replace('《', '')
    text = text.replace('》', '')
    text = re.sub('[\u4e00-\u9fa5]', ' ', text)  # the "English" documents even contain Chinese characters; strip them
    text = re.sub(r'/', ' ', text)
    text = re.sub(r'//', ' ', text)
    text = re.sub(r'\\', ' ', text)
    text = re.sub(r'\\\\', ' ', text)
    text = re.sub(r'-', ' ', text)  # split hyphenated words in two (e.g. july-edu ==> july edu)
    text = re.sub(r'--', ' ', text)
    text = re.sub(r'—', ' ', text)
    text = re.sub(r'\d+', '', text)  # remove digits
    words = word_tokenize(text)
    english_stopwords = stopwords.words('english')
    addition_stopwords = ['\'re', '\'s', '\'t', '\'m', '\'ll', '\'ve', '\'d', 'n\'t']
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '-', '..', '...', '......', '|', '``', '\'', '\'\'', '<', '>', '~', '+', '/', '//', '"']

    filter_words = [w for w in words if w not in english_stopwords]
    filter_words = [w for w in filter_words if w not in addition_stopwords]
    filter_words = [w for w in filter_words if w not in english_punctuations]

    result = []
    for filter_word in filter_words:
        pattern = re.compile('.*[0-9]+.*')  # digits still slip through the regex above, so filter once more
        match = pattern.findall(filter_word)
        # URLs and e-mail addresses are written so inconsistently across documents that
        # no single regex covers them; they are simply filtered out here.
        if ('www' not in filter_word and '.com' not in filter_word and '@' not in filter_word
                and len(filter_word) > 1 and not match):
            result.append(filter_word)
    return result


def mark_title_distribute():  # record the topic distribution of every document
    with open(output_file_path, 'w', encoding='utf-8') as f:
        for i in range(len(titles)):
            f.write(titles[i])
            topic_pro = lda.get_document_topics(corpus[i])
            f.write(str(topic_pro) + '\n')


def Merge_title_content():
    # The corpus is stored as alternating lines: one line of title, one line of body.
    # When reading it back, title and body are merged into a single document.
    i = 1
    for line in data_file.readlines():
        if i % 2 == 1:
            titles.append(line)
            title = process_articles(line)
            texts.append(title)
        elif i % 2 == 0:
            body = process_articles(line)
            index = int(i / 2 - 1)
            texts[index] = texts[index] + body
        i = i + 1
    data_file.close()


if __name__ == '__main__':
    data_file_path = 'd:/2023.txt'
    output_file_path = 'd:/output.txt'
    data_file = open(data_file_path, encoding='utf-8')
    texts = []
    titles = []
    Merge_title_content()
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]  # bag-of-words word-frequency statistics
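    # The snippets above never show where `lda` comes from, although mark_title_distribute()
    # and the pyLDAvis call further down use it. A minimal sketch of the missing training step
    # (num_topics, passes and random_state are assumed values, not the author's):
    lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, passes=10, random_state=1)
    for topic in lda.print_topics(num_words=10):  # inspect the top words of each topic
        print(topic)
    mark_title_distribute()  # write each document's topic distribution to output_file_path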
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.palettes import Category20_20
import pandas as pd
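These bokeh imports are not used anywhere in the snippets shown; one plausible use, purely as a sketch, is to plot each document's dominant topic with a hover tooltip (it reuses `lda` and `corpus` from the gensim code above):

dominant = [max(lda.get_document_topics(bow, minimum_probability=0.0), key=lambda t: t[1])
            for bow in corpus]
df = pd.DataFrame({
    'doc': ['doc %d' % i for i in range(len(dominant))],
    'topic': [t for t, _ in dominant],           # dominant topic id per document
    'prob': [round(pr, 3) for _, pr in dominant],  # its probability
})
df['color'] = [Category20_20[t % 20] for t in df['topic']]

source = ColumnDataSource(df)
p = figure(x_range=list(df['doc']), height=400, title='Dominant topic per document')
p.vbar(x='doc', top='prob', width=0.8, color='color', source=source)
p.add_tools(HoverTool(tooltips=[('document', '@doc'), ('topic', '@topic'), ('probability', '@prob')]))
show(p)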
import pyLDAvis
import pyLDAvis.gensim_models as gensim_models

d = gensim_models.prepare(lda, corpus, dictionary)
pyLDAvis.show(d)
Problems encountered
The first problem
Traceback (most recent call last):
  File "C:\Users\13772\Desktop\所有文件\untitled\2022\12月\12月29号词频\考研英语\test.py", line 85, in <module>
    pyLDAvis.show(d)
  File "C:\Users\13772\AppData\Roaming\Python\Python39\site-packages\pyLDAvis\_display.py", line 262, in show
    '/LDAvis.css': ["text/css", open(urls.LDAVIS_CSS_URL, 'r').read()],
OSError: [Errno 22] Invalid argument: 'https://cdn.jsdelivr.net/gh/bmabey/pyLDAvis@3.3.1/pyLDAvis/js/ldavis.v1.0.0.css'
Solution
Modify the pyLDAvis source code: change the True to False.
Testing the functional cases above shows that both the system's basic features and the LDA topic-model feature work correctly; the results all meet the expected goals.
Cause of the error
If the value is not changed to False, pyLDAvis cannot load the remote JS/CSS resources, so the LDA visualization cannot be displayed.
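If editing the installed package is undesirable, the same effect can most likely be obtained at the call site; a minimal sketch, assuming the flag being flipped is the `local` parameter of `pyLDAvis.show` (which controls whether the JS/CSS is read from local files or fetched remotely):

pyLDAvis.show(d, local=False)  # serve the visualization using the remote JS/CSS from the CDN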
The second problem
Traceback (most recent call last):
  File "C:\Users\13772\Desktop\所有文件\untitled\2022\12月\12月29号词频\考研英语\test.py", line 84, in <module>
    d = gensim_models.prepare(lda, corpus, dictionary)
  File "C:\Users\13772\AppData\Roaming\Python\Python39\site-packages\pyLDAvis\gensim_models.py", line 123, in prepare
    return pyLDAvis.prepare(**opts)
  File "C:\Users\13772\AppData\Roaming\Python\Python39\site-packages\pyLDAvis\_prepare.py", line 439, in prepare
    topic_info = _topic_info(topic_term_dists, topic_proportion,
  File "C:\Users\13772\AppData\Roaming\Python\Python39\site-packages\pyLDAvis\_prepare.py", line 246, in _topic_info
    default_term_info = default_term_info.sort_values(
TypeError: drop() takes from 1 to 2 positional arguments but 3 were given