# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup

# Default chunk-size limit (in characters) used by the chunking helpers.
DEFAULT_MAX_LEN = 508
# Default separator between text segments; html_parse joins lines with it.
DEFAULT_SEP = ","


def second_pro(text, max_len=DEFAULT_MAX_LEN, sep=DEFAULT_SEP):
    """Peel trailing segments off *text* until the remaining prefix fits.

    The last segment is always moved to the carried list; further trailing
    segments follow while the remaining prefix is still ``max_len``
    characters or longer.

    Args:
        text: Text made of segments joined by ``sep``.
        max_len: The returned prefix is strictly shorter than this.
        sep: Segment separator.

    Returns:
        A ``(prefix, carried)`` tuple. ``prefix`` is the leading segments
        re-joined with ``sep`` (``""`` when no leading segment fits), and
        ``carried`` is the list of removed segments in document order.
    """
    segments = text.split(sep)
    # str.split always yields at least one item, so this pop is safe.
    carried = [segments.pop()]
    while segments and len(sep.join(segments)) >= max_len:
        carried.append(segments.pop())
    # Segments were collected back-to-front; restore reading order so that
    # callers can re-join them without scrambling the text.
    carried.reverse()
    return sep.join(segments), carried


def maching(text, max_len=DEFAULT_MAX_LEN, sep=DEFAULT_SEP):
    """Split *text* into consecutive chunks of at most ``max_len`` characters.

    Chunks are cut only at ``sep`` boundaries. A single segment that is too
    long on its own is emitted unchanged as its own chunk rather than being
    dropped or cut mid-segment. For text without empty segments,
    ``sep.join(result) == text``.

    Args:
        text: Text made of segments joined by ``sep``.
        max_len: Maximum chunk length; must be at least 1.
        sep: Segment separator.

    Returns:
        The list of chunks in document order (empty for empty *text*).

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        raise ValueError("max_len must be at least 1")

    chunks = []
    pending = []
    for segment in text.split(sep):
        pending.append(segment)
        # Each pass removes at least one segment from ``pending``, so the
        # loop terminates.
        while len(sep.join(pending)) > max_len:
            chunk, pending = second_pro(
                sep.join(pending), max_len=max_len, sep=sep
            )
            if not chunk:
                # No prefix fits: the leading segment is too long by itself.
                chunk, pending = pending[0], pending[1:]
            if chunk:
                chunks.append(chunk)

    # Flush whatever is left; without this the tail of the text is lost.
    tail = sep.join(pending)
    if tail:
        chunks.append(tail)
    return chunks


def html_parse(text):
    """Extract the visible text of an HTML document as one compact string.

    Every line of the extracted text has all whitespace removed (including
    non-breaking and full-width spaces); lines that end up empty are
    skipped and the rest are joined with ``DEFAULT_SEP``.

    Args:
        text: HTML markup.

    Returns:
        The cleaned lines joined by ``DEFAULT_SEP``.
    """
    soup = BeautifulSoup(text, "html.parser")
    cleaned = []
    for line in soup.get_text().split("\n"):
        # str.split() with no argument splits on any run of whitespace, so
        # re-joining the pieces strips the line and removes inner blanks.
        compact = "".join(line.split())
        if compact:
            cleaned.append(compact)
    return DEFAULT_SEP.join(cleaned)


if __name__ == '__main__':
    # Placeholder input (the literal roughly reads "enter all content of Doc
    # in Network"); replace it with the HTML markup to parse.
    text = """输入Network中Doc内所有内容"""
    aa = html_parse(text)
    print(aa, "\n")
# The content of `text` is shown in the figure below: