def read_file(file_input):
    """Read a text file and return its lines as a list of strings.

    Each line is returned with leading and trailing whitespace (including
    the line terminator) stripped; blank lines are kept as empty strings.
    Bytes that cannot be decoded as UTF-8 are dropped instead of raising.

    Args:
        file_input: Path of the file to read.

    Returns:
        A list with one stripped string per line, in file order.
    """
    with open(file_input, encoding="utf-8", errors="ignore") as fin:
        return [line.strip() for line in fin]
读取停用词文件
也可以是其他词表之类的、单个词为一行且需要去重的文件
def load_dic(file_input):
    """Load a one-entry-per-line word list (e.g. stop words) as a set.

    Each line is stripped of surrounding whitespace and duplicates collapse
    naturally in the set. A blank line in the file contributes the empty
    string to the result. Bytes that cannot be decoded as UTF-8 are dropped.

    Args:
        file_input: Path of the word-list file.

    Returns:
        A set containing the stripped content of every line.
    """
    with open(file_input, encoding="utf-8", errors="ignore") as fin:
        # Set comprehension: no intermediate list is built.
        return {line.strip() for line in fin}
读取idf文件
格式如下(每行一个词及其idf值,以空白字符分隔):
word idf
凳子 8.36728816325
def load_idf(file_input):
    """Load an IDF dictionary from a whitespace-separated text file.

    Every non-blank line must contain exactly two fields: the word and its
    IDF value, e.g. ``凳子 8.36728816325``. Blank lines are skipped so that a
    trailing empty line does not abort the load. If a word appears more than
    once, the last value wins.

    Args:
        file_input: Path of the IDF file.

    Returns:
        A dict mapping each word to its IDF value as a float.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields or
            the second field cannot be parsed as a float.
    """
    words_idf = {}
    with open(file_input, encoding="utf-8", errors="ignore") as fin:
        for line_no, line in enumerate(fin, start=1):
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse.
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"line {line_no}: expected 'word idf', got {line.strip()!r}"
                )
            word, idf = fields
            words_idf[word] = float(idf)
    return words_idf
读取词向量文件
格式如下(一般词向量文件的第一行是info信息,读取时会跳过):
word vec
的 1 2 3 4 5 6 7 …
def load_word2vec(vec_file, word_dim):
    """Load a text word-vector file into a dict of NumPy row vectors.

    The first line of the file is always skipped (it is treated as the
    info/header line). Each following line is split at the first space into
    a word and its vector text; the vector text is then split on whitespace
    and converted to floats. Lines that do not split into two parts (for
    example blank lines) are logged as errors and skipped.

    Args:
        vec_file: Path of the word-vector file.
        word_dim: Expected number of components per vector.

    Returns:
        A dict mapping each word to an ``np.ndarray`` of shape
        ``(1, word_dim)``.

    Raises:
        ValueError: If a vector cannot be parsed as floats or cannot be
            reshaped to ``(1, word_dim)``. The underlying error is chained
            as ``__cause__``.
    """
    word2vec = {}
    with open(vec_file, encoding="utf-8", errors="ignore") as fin:
        fin.readline()  # skip the header line
        for raw_line in fin:
            parts = raw_line.strip().split(" ", 1)
            if len(parts) != 2:
                logging.error("vec file maybe err, please check!")
                continue
            word, vec_text = parts
            try:
                values = [float(item) for item in vec_text.split()]
                word2vec[word] = np.array(values).reshape(1, word_dim)
            except (ValueError, TypeError) as err:
                # Chain the cause so the original traceback is not lost.
                raise ValueError(
                    "vec size is not equal %s, error is %s" % (word_dim, err)
                ) from err
    return word2vec
"注意:" "这里把输出的文件直接放在原路径下,文件名和原文件名相同并加后缀。" "如果要改输出可以自己改改代码或者提issue我有时间改改" "按照num分割文件速度比较慢,后面有时间优化下速度" try: n = int(num) line_num = str(int(line_num)) except Exception as e: raise ValueError(f"n or line_num must be a number, please check it!")
if file_in.startswith("."): prefix = "."+"".join(file_in.split(".")[:-1]) else: prefix = "".join(file_in.split(".")[:-1])
logging.info("file prefix is ", prefix) len_file = get_file_len(file_in)
with open(file_in, encoding='utf-8') as fin: for line_num, line_text in enumerate(fin): sub_text.append(line_text) if line_num == each_file_len * (idx + 1): idx += 1 all_text.append(sub_text) sub_text = [] logging.warning(f"file {idx} append success") if need_remaining: all_text.append(sub_text) # 多余的不足平均行数的文本 logging.warning(f"file {idx} append success")
for idx, text in enumerate(all_text): write_file = f"{prefix}_{idx+1}.txt" with open(write_file, 'w', encoding='utf-8') as fout: for i in text: fout.write(i) else: raise ValueError("please input right method one of num,line_num,mem")
def merge_and_write(base_dir, write_path, is_dfs=False):
    """Concatenate the text files under a directory into one output file.

    Every input line is written stripped of surrounding whitespace and
    followed by a single newline. Input files are decoded as UTF-8 with
    undecodable bytes dropped.

    Args:
        base_dir: Directory whose files are merged.
        write_path: Path of the output file; it is created or truncated.
        is_dfs: If True, descend into all sub-directories and merge their
            files too. If False (default), merge only the files located
            directly in ``base_dir``.

    Returns:
        None
    """
    # Remember the output file so it is never merged into itself when
    # write_path happens to live inside base_dir.
    output_real = os.path.realpath(write_path)
    with open(write_path, "w", encoding="utf-8") as fout:
        # os.walk is top-down by default, so the first item is base_dir.
        for dirpath, _dirnames, filenames in os.walk(base_dir):
            for sub_file in filenames:
                file_path = os.path.join(dirpath, sub_file)
                if os.path.realpath(file_path) == output_real:
                    continue
                # Stream line by line with the same per-line handling as
                # read_file (strip, ignore undecodable bytes), so a large
                # input is never held in memory as a whole.
                with open(file_path, encoding="utf-8", errors="ignore") as fin:
                    for line in fin:
                        fout.write(f"{line.strip()}\n")
                logging.info(f"writing file {file_path} success!!!")
            if not is_dfs:
                # Non-recursive mode: stop after the top-level directory.
                break