Função C# que extrai caracteres chineses do código HTML

Autor：Eve Cole Data da Última Atualização：2025-03-31 09:48:02

/// <summary>
/// Removes HTML markup from a string and decodes a small set of character entities,
/// leaving only the plain text.
/// </summary>
/// <remarks>
/// The rules are applied in a fixed order: script elements, tags, line breaks followed
/// by whitespace, named/numeric entities, then HTML comment delimiters. Any angle
/// brackets or CR/LF pairs still present afterwards are removed.
/// </remarks>
/// <param name="strhtml">Source text that may contain HTML markup.</param>
/// <returns>
/// The text with markup removed; an empty string when <paramref name="strhtml"/> is
/// null or empty.
/// </returns>
public static string striphtml(string strhtml)
{
    if (string.IsNullOrEmpty(strhtml))
    {
        return string.Empty;
    }

    // Each row is { pattern, replacement }. Keeping the pair on one row guarantees the
    // two lists can never drift out of step (the previous parallel arrays had different
    // lengths, which made the loop index past the end of the replacement array).
    string[,] rules =
    {
        // Whole script elements; [\s\S] lets the body span several lines.
        { @"<script[^>]*?>[\s\S]*?</script>", "" },
        // Opening, closing and self-closing tags, including declarations such as
        // "<!DOCTYPE ...>". Quoted attribute values may contain '>'. The alternatives
        // inside the loop start with different characters, so a "<" that is never
        // closed cannot trigger exponential backtracking.
        { @"<(?:/\s*)?!?(?:\w+:)?\w+(?:""[^""]*""|'[^']*'|[^""'<>])*>", "" },
        // A line break followed by further whitespace (indentation, blank lines).
        { @"([\r\n])[\s]+", "" },
        { @"&(quot|#34);", "\"" },
        { @"&(amp|#38);", "&" },
        { @"&(lt|#60);", "<" },
        { @"&(gt|#62);", ">" },
        { @"&(nbsp|#160);", " " },
        { @"&(iexcl|#161);", "\u00A1" }, // inverted exclamation mark
        { @"&(cent|#162);", "\u00A2" },  // cent sign
        { @"&(pound|#163);", "\u00A3" }, // pound sign
        { @"&(copy|#169);", "\u00A9" },  // copyright sign
        // Any other numeric entity is dropped rather than decoded.
        { @"&#(\d+);", "" },
        // Turn a comment terminator into a line break so that the next rule, which
        // deletes from "<!--" to the end of the line, stops right after the comment.
        { @"-->", "\r\n" },
        { @"<!--.*\n", "" }
    };

    string stroutput = strhtml;
    for (int i = 0; i < rules.GetLength(0); i++)
    {
        stroutput = Regex.Replace(stroutput, rules[i, 0], rules[i, 1], RegexOptions.IgnoreCase);
    }

    // String.Replace returns a new string; the result has to be assigned or the call
    // has no effect. These remove leftover angle brackets and CR/LF pairs.
    stroutput = stroutput.Replace("<", "");
    stroutput = stroutput.Replace(">", "");
    stroutput = stroutput.Replace("\r\n", "");

    return stroutput;
}