如何解决ASP.NET试图从字符串中剥离html
| 我有一个CMS,用户可以通过CuteEditor输入内容(这部分工作正常),然后我在网站上显示这些数据。有一种很少发生但令人讨厌的情况:用户在文本中带入了某些标记,使这段文字的字体看起来与页面上的其他文字不同,例如:<span style="font-size: 11pt">Special Olympics Ireland provides year round sports training and athletic competition in a variety of Olympic type sports for persons with intellectual disabilities in </span><span style="font-size: 11pt">Ireland</span><span style="font-size: 11pt"> and </span><span style="font-size: 11pt">Northern Ireland</span><span style="font-size: 11pt"> in accordance with and furtherance of the mission,goal and founding principles of the international Special Olympics movement.</span>
基本上我想做的是
String.Replace("<span style=\"font-size: 11pt\">", "")
但是,这当然只能捕获上面这一种情况;下一次他们可能会使用8、9或10的字体大小,因此过滤方法必须足够智能,能够处理这些变化。
有任何想法吗 ?
所以目前我有类似
testSpan = Regex.Replace(testSpan, @"</?\w+((\s+\w+(\s*=\s*(?:"".*?""|'.*?'|[^'"">\s]+))?)+\s*|\s*)/?>", String.Empty);
但是它基本上去掉了所有的HTML,而我只想去掉<span>标签。
解决方法
如果要遵循StackOverflow的示例,则可以创建允许的HTML标签的白名单,然后删除其余部分。
以下是Jeff Atwood用于清理和平衡StackOverflow用户生成的内容中的HTML标签的代码段。
清理http://refactormycode.com/codes/333-sanitize-html
平衡http://refactormycode.com/codes/360-balance-html-tags
允许标签的列表https://meta.stackexchange.com/questions/1777/what-html-tags-are-allowed
更新
看起来Refactormycode已死。这是我在此之前捕获的一些代码:
/// <summary>
/// Static helpers for processing strings that contain HTML: whitelist-based
/// sanitizing (<see cref="SanitizeHtml"/>) and removal of unbalanced tags
/// (<see cref="BalanceTags"/>).
/// </summary>
/// <remarks>
/// Needs <c>using System;</c> and <c>using System.Text.RegularExpressions;</c> in scope.
/// </remarks>
public static class HtmlStripper
{
    #region Sanitize

    // Matches anything that looks like a tag: '<' followed by non-'>' characters,
    // terminated either by '>' or by the end of the input (unterminated tag).
    private static readonly Regex Tags = new Regex(
        "<[^>]*(>|$)",
        RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled);

    // Attribute-less formatting tags that are allowed through, plus <br> / <hr>.
    // The pattern is lower-case only; callers lower-case the tag before testing.
    private static readonly Regex Whitelist = new Regex(
        @"
        ^</?(b(lockquote)?|code|d(d|t|l|el)|em|h(1|2|3)|i|kbd|li|ol|p(re)?|s(ub|up|trong|trike)?|ul)>$|
        ^<(b|h)r\s?/?>$",
        RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled |
        RegexOptions.IgnorePatternWhitespace);

    // <a> with an href that is either #digits or an http/https/ftp URL, optionally
    // followed by title and target attributes (in that order), and the closing </a>.
    private static readonly Regex WhitelistA = new Regex(
        @"
        ^<a\s
        href=""(\#\d+|(https?|ftp)://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+)""
        (\stitle=""[^""<>]+"")?(\starget=""[^""<>]+"")?\s?>$|
        ^</a>$",
        RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled |
        RegexOptions.IgnorePatternWhitespace);

    // <img> with an http/https src, optionally followed by width, height, alt and
    // title attributes (in that order).
    private static readonly Regex WhitelistImg = new Regex(
        @"
        ^<img\s
        src=""https?://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+""
        (\swidth=""\d{1,3}"")?
        (\sheight=""\d{1,3}"")?
        (\salt=""[^""<>]*"")?
        (\stitle=""[^""<>]*"")?
        \s?/?>$",
        RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled |
        RegexOptions.IgnorePatternWhitespace);

    /// <summary>
    /// Sanitize any potentially dangerous tags from the provided raw HTML input using
    /// a whitelist based approach, leaving the "safe" HTML tags.
    /// CODESNIPPET:4100A61A-1711-4366-B0B0-144D1179A937
    /// </summary>
    /// <param name="html">Raw HTML; may be null or empty.</param>
    /// <returns>
    /// <paramref name="html"/> with every tag that matches none of the whitelists removed.
    /// Text between tags is left untouched. Null or empty input is returned as is.
    /// </returns>
    /// <remarks>
    /// Based on Jeff Atwood's code, found at http://refactormycode.com/codes/333-sanitize-html
    /// Since Jeff Atwood is StackOverflow's administrator, this is most likely the code used by
    /// that site. See http://meta.stackoverflow.com/questions/1777/what-html-tags-are-allowed
    /// for a list of allowed tags.
    /// </remarks>
    public static string SanitizeHtml(string html)
    {
        if (String.IsNullOrEmpty(html))
        {
            return html;
        }

        // match every HTML tag in the input
        MatchCollection tags = Tags.Matches(html);

        // Walk backwards so that removing a tag never shifts the index of a tag
        // that has not been examined yet.
        for (int i = tags.Count - 1; i >= 0; i--)
        {
            Match tag = tags[i];

            // The whitelist patterns are lower-case only, so test a lower-cased copy;
            // the removal itself uses the match position in the original string.
            string loweredTag = tag.Value.ToLowerInvariant();

            bool allowed = Whitelist.IsMatch(loweredTag)
                           || WhitelistA.IsMatch(loweredTag)
                           || WhitelistImg.IsMatch(loweredTag);
            if (!allowed)
            {
                html = html.Remove(tag.Index, tag.Length);
            }
        }

        return html;
    }

    #endregion

    #region Balance tags

    // Matches an opening or closing tag and captures its name in the "tagname" group.
    private static readonly Regex Namedtags = new Regex(
        @"</?(?<tagname>\w+)[^>]*(\s|$|>)",
        RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled);

    /// <summary>
    /// Attempt to balance HTML tags in the html string
    /// by removing any unmatched opening or closing tags.
    /// IMPORTANT: we *assume* HTML has *already* been
    /// sanitized and is safe/sane before balancing!
    ///
    /// CODESNIPPET: A8591DBA-D1D3-11DE-947C-BA5556D89593
    /// </summary>
    /// <param name="html">Already sanitized HTML; may be null or empty.</param>
    /// <returns>
    /// <paramref name="html"/> with every opening tag that has no later closing tag, and
    /// every closing tag that has no earlier opening tag, removed. Tags named p, img, br,
    /// li and hr are never removed. Null or empty input is returned as is.
    /// </returns>
    /// <remarks>
    /// From Jeff Atwood's post at
    /// http://refactormycode.com/codes/360-balance-html-tags
    /// </remarks>
    public static string BalanceTags(string html)
    {
        if (String.IsNullOrEmpty(html))
        {
            return html;
        }

        // Match against a lower-cased copy so tag names compare case-insensitively.
        // Match positions are later applied to the original string.
        MatchCollection tags = Namedtags.Matches(html.ToLowerInvariant());

        // no HTML tags present? nothing to do; exit now
        int tagcount = tags.Count;
        if (tagcount == 0)
        {
            return html;
        }

        // tags that are allowed to appear without a partner
        const string ignoredtags = "<p><img><br><li><hr>";

        var tagpaired = new bool[tagcount];
        var tagremove = new bool[tagcount];

        // loop through matched tags in forward order
        for (int ctag = 0; ctag < tagcount; ctag++)
        {
            string tagname = tags[ctag].Groups["tagname"].Value;

            // skip any already paired tags
            // and skip tags in our ignore list; assume they're self-closed
            if (tagpaired[ctag] || ignoredtags.Contains("<" + tagname + ">"))
            {
                continue;
            }

            string tag = tags[ctag].Value;
            int match = -1;

            if (tag.StartsWith("</", StringComparison.Ordinal))
            {
                // this is a closing tag
                // search backwards (previous tags), look for opening tags
                string openPrefix = "<" + tagname;
                for (int ptag = ctag - 1; ptag >= 0; ptag--)
                {
                    string prevtag = tags[ptag].Value;

                    // cheap prefix test first, then require <tag> or <tag{space}
                    // so that e.g. <b does not pair with <blockquote>
                    if (!tagpaired[ptag]
                        && prevtag.StartsWith(openPrefix, StringComparison.Ordinal)
                        && (prevtag.StartsWith(openPrefix + ">", StringComparison.Ordinal)
                            || prevtag.StartsWith(openPrefix + " ", StringComparison.Ordinal)))
                    {
                        match = ptag;
                        break;
                    }
                }
            }
            else
            {
                // this is an opening tag
                // search forwards (next tags), look for closing tags
                string closingTag = "</" + tagname + ">";
                for (int ntag = ctag + 1; ntag < tagcount; ntag++)
                {
                    if (!tagpaired[ntag]
                        && tags[ntag].Value.Equals(closingTag, StringComparison.Ordinal))
                    {
                        match = ntag;
                        break;
                    }
                }
            }

            // we tried, regardless, if we got this far
            tagpaired[ctag] = true;
            if (match == -1)
            {
                tagremove[ctag] = true; // mark for removal
            }
            else
            {
                tagpaired[match] = true; // mark paired
            }
        }

        // loop through tags again, this time in reverse order
        // so we can safely delete all orphaned tags from the string
        for (int ctag = tagcount - 1; ctag >= 0; ctag--)
        {
            if (tagremove[ctag])
            {
                html = html.Remove(tags[ctag].Index, tags[ctag].Length);
            }
        }

        return html;
    }

    #endregion
}
, 对于这种事情,您实际上应该使用适当的HTML解析器。
, 这是我用来从VB.NET中的字符串中剥离HTML的函数:
''' <summary>
''' Removes everything that looks like a tag (from a "&lt;" to the nearest following "&gt;",
''' newlines included) from the given string and returns the remaining text.
''' </summary>
''' <param name="htmlString">Text that may contain HTML markup; may be Nothing or empty.</param>
''' <returns>
''' The input with all tag-like spans removed. Nothing or an empty string is returned unchanged.
''' </returns>
Public Shared Function StripHTML(ByVal htmlString As String) As String
    ' Regex.Replace throws on a Nothing input, so hand empty input straight back.
    If String.IsNullOrEmpty(htmlString) Then
        Return htmlString
    End If

    ' (.|\n) lets a tag span several lines; *? is lazy so each match stops at the first ">".
    Dim pattern As String = "<(.|\n)*?>"
    Return Regex.Replace(htmlString, pattern, String.Empty)
End Function
希望能帮助到你
, 对于这种特定情况,您可以执行以下操作
// Sample markup from the question: a run of sibling <span> elements.
String input = @"<span style=""font-size: 11pt"">Special Olympics Ireland provides year round sports training and athletic competition in a variety of Olympic type sports for persons with intellectual disabilities in </span><span style=""font-size: 11pt"">Ireland</span><span style=""font-size: 11pt""> and </span><span style=""font-size: 11pt"">Northern Ireland</span><span style=""font-size: 11pt""> in accordance with and furtherance of the mission,goal and founding principles of the international Special Olympics movement.</span>";
// XElement.Parse accepts exactly one root element, so the fragment is wrapped in one.
// &nbsp; is not one of XML's predefined entities and would make the parser throw,
// so it is replaced by a plain space before parsing.
var element = XElement.Parse("<root>" + input.Replace("&nbsp;", " ") + "</root>");
// Value is the concatenated text of all descendant text nodes, i.e. the text without tags.
string stripped = element.Value;
但通常您不希望直接在html上处理任何类型的字符串操作或解析。最好使用其他答案中指出的解析器。
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。