常用的文本去重代码:
本代码用到了simhash库(https://github.com/yanyiwu/simhash)
import re
import os
from pathlib import Path
import json
from simhash import Simhash

try:
    from tqdm import tqdm
except ImportError:
    # tqdm only draws a progress bar; without it, iterate silently.
    def tqdm(iterable, **kwargs):
        """Stand-in for tqdm.tqdm that returns the iterable unchanged."""
        return iterable


# Removed from the text before hashing: brace-wrapped tags made of lowercase
# letters followed by one digit (e.g. "{abc1}"), and every non-word character.
_NOISE_RE = re.compile(r'{[a-z]+\d}|\W')

# Separator between the leading sequence field and the text of a line.
_FIELD_SEP_RE = re.compile(r'\s+')


def _split_seq_text(line):
    """Split a line into its leading sequence field and the remaining text.

    The split happens at the first run of whitespace. A line without any
    whitespace has no text part; an empty string is returned for it rather
    than failing on tuple unpacking.

    Args:
        line: One input line.

    Returns:
        A ``(seq, text)`` tuple of strings.
    """
    parts = _FIELD_SEP_RE.split(line, maxsplit=1)
    if len(parts) == 1:
        return parts[0], ''
    return parts[0], parts[1]


def _hamming_distance(v1, v2, f=64):
    """Return the number of differing bits in the low ``f`` bits of two ints."""
    diff = (v1 ^ v2) & ((1 << f) - 1)
    return bin(diff).count('1')


def droplike(lines, max_distance=3):
    """Drop lines whose text is a near-duplicate of an earlier line.

    Each line is split into a sequence field and a text field. The text is
    stripped of noise (see ``_NOISE_RE``) and fingerprinted with Simhash.
    A line is dropped when its fingerprint is within ``max_distance`` bits
    of the fingerprint of any earlier line. Every dropped line is printed
    together with the earlier line it matched.

    Args:
        lines: Sequence of strings (must support ``len()`` and indexing),
            each laid out as ``<seq><whitespace><text>``. A line with no
            text part is fingerprinted as empty text.
        max_distance: Largest Hamming distance (inclusive) at which two
            fingerprints are treated as duplicates. Defaults to 3.

    Returns:
        A new list holding the retained lines in their original order.
    """
    values = []
    for line in lines:
        _, text = _split_seq_text(line)
        values.append(Simhash(_NOISE_RE.sub('', text)).value)

    lines_new = []
    for i, line in tqdm(enumerate(lines), total=len(lines)):
        v1 = values[i]
        # Compare against every earlier line, including ones already dropped,
        # so a chain of near-duplicates collapses to its first member.
        # Indexing with range(i) avoids copying the list on each iteration.
        match = next(
            (j for j in range(i)
             if _hamming_distance(v1, values[j]) <= max_distance),
            None,
        )
        if match is None:
            lines_new.append(line)
        else:
            print(line, lines[match])
    return lines_new


def droplike_file(sour_file):
    """Remove near-duplicate lines from a UTF-8 text file, in place.

    Blank lines are discarded and surrounding whitespace is stripped from
    each line before deduplication with ``droplike``. The file is then
    overwritten with the retained lines, each written as ``<seq> <text>``
    with a single space between the two fields.

    Args:
        sour_file: Path of the file to read and overwrite.
    """
    with open(sour_file, 'r', encoding='utf-8') as f_in:
        stripped = (raw.strip() for raw in f_in)
        lines = [line for line in stripped if line]

    kept = droplike(lines)

    # Format all output before reopening the file in write mode, so an error
    # here cannot leave the file truncated.
    output = ['{} {}\n'.format(*_split_seq_text(line)) for line in kept]

    with open(sour_file, 'w', encoding='utf-8') as f_out:
        f_out.writelines(output)