using System;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
namespace Server.Utils
{
/**
* Collection of generic text utilities.
* FilterText is the only public entry point; every other member is a private helper.
*/
public static class TextUtils
{
    // Valid html templates used to rebuild the two kinds of tag that are allowed through.
    private const string AnchorTemplate = "<a href=\"{0}\">{1}</a>";
    private const string ImageTemplate = "<img src=\"{0}\" />";

    // Matches an 'a' tag OR an 'img' tag OR any other tag. Capture groups:
    //   1 = full 'a' tag (if matched)
    //   2 = href value of the 'a' tag
    //   3 = body of the 'a' tag (everything up to the first closing 'a' tag)
    //   4 = full 'img' tag (if matched)
    //   5 = src value of the 'img' tag
    //   6 = any other tag
    // Singleline lets '.' in the anchor body cross line breaks, so a link that spans
    // several lines is still recognised as a link. CultureInvariant keeps the
    // case-insensitive comparison independent of the current thread culture.
    // NOTE(review): an opening 'a' tag without a closing tag makes the anchor alternative
    // scan to the end of the text before giving up; very large untrusted inputs may be slow.
    private static readonly Regex HtmlTagPattern = new Regex(
        "(<a[^>]*href[ ]*=[\"' ]?([^'\" >]*)[^>]*>((?:(?<!</a>).)*)</a>)|" +
        "(<img[^>]*src[ ]*?=(?:[\"' ]?)*([^'\" >]*)[^>]*?>)|(<[^>]*?>)",
        RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.CultureInvariant | RegexOptions.Compiled);

    // Any run of characters that is not allowed inside a rebuilt URL.
    private static readonly Regex InvalidUrlCharsPattern = new Regex(
        @"[^\w#!:.,?'~+=$*()&%@!\-_/]+",
        RegexOptions.CultureInvariant | RegexOptions.Compiled);

    // 'script:' or 'data:', optionally prefixed with 'java' or 'vb'.
    private static readonly Regex ScriptSchemePattern = new Regex(
        @"(?:java|vb)?(?:script|data):",
        RegexOptions.IgnoreCase | RegexOptions.CultureInvariant | RegexOptions.Compiled);

    // Anything between a '<' and the next '>'.
    private static readonly Regex AnyTagPattern = new Regex(
        @"<[^>]*?>",
        RegexOptions.CultureInvariant | RegexOptions.Compiled);

    /**
    * Filters HTML tags from supplied text
    *
    * @param string txt the text to filter; null or empty yields an empty string
    * @param bool allowHTML If false, every tag is removed (stripEvil).
    *        If true, anchors and images are rebuilt from a safe template and
    *        every other tag is HTML-encoded (filterHTML).
    * @return string filtered text (never null)
    */
    public static string FilterText(string txt, bool allowHTML)
    {
        // Regex.Replace throws on null input, so settle the trivial case up front.
        if (string.IsNullOrEmpty(txt))
        {
            return string.Empty;
        }

        // TODO: add linksOnly support
        return allowHTML ? filterHTML(txt) : stripEvil(txt);
    }

    #region private methods
    /**
    * filterHTML rewrites every tag found in the text:
    *  - anchor links are rebuilt as <a href="URL">BODY</a>, where URL went through fixURL
    *    and BODY went through stripMal and then filterHTML again (so images inside links survive)
    *  - images are rebuilt as <img src="URL" />, where URL went through fixURL
    *  - any other tag is HTML-encoded, so it is displayed as text instead of being interpreted
    * All attributes other than href / src are dropped because the tag is rebuilt from a template.
    * Text outside of tags is returned untouched.
    * @param string txt the text to filter (must not be null)
    * @requires fixURL()
    * @requires stripMal()
    * @example
    *  1. filterHTML('just some plain text')
    *     returns 'just some plain text'
    *  2. filterHTML('a link: <a href="www.google.com">Google</a> the end!')
    *     returns 'a link: <a href="http://www.google.com">Google</a> the end!'
    *  3. filterHTML('<a href="javascript:window.close()">bad stuff</a>')
    *     returns '<a href="http://window.close()">bad stuff</a>'
    *  4. filterHTML('midtext <img src="asdf" onload="domorebadstuff()"> endtext')
    *     returns 'midtext <img src="http://asdf" /> endtext'
    *  5. filterHTML('<blink>hi</blink>')
    *     returns '&lt;blink&gt;hi&lt;/blink&gt;'
    *  6. an opening anchor with no closing tag is treated like any other tag and encoded
    * @return string filtered text
    * NOTE(review): a '<' that is never followed by '>' is not a tag for this filter and is
    * returned as-is -- confirm that the rendering side escapes or tolerates it.
    */
    private static string filterHTML(string txt)
    {
        return HtmlTagPattern.Replace(txt, new MatchEvaluator(rebuildTag));
    }

    /**
    * rebuildTag produces the replacement for one tag matched by HtmlTagPattern
    * @param Match m a match of HtmlTagPattern
    * @return string rebuilt anchor, rebuilt image, or the encoded tag
    */
    private static string rebuildTag(Match m)
    {
        if (m.Groups[1].Success)
        {
            // matched 'a' tag: clean the link target, then filter the body recursively
            string href = m.Groups[2].Value;
            string linkBody = m.Groups[3].Value;
            return string.Format(AnchorTemplate, fixURL(href), filterHTML(stripMal(linkBody)));
        }

        if (m.Groups[4].Success)
        {
            // matched 'img' tag: only the cleaned source survives
            return string.Format(ImageTemplate, fixURL(m.Groups[5].Value));
        }

        if (m.Groups[6].Success)
        {
            // matched any other tag: encode it so it shows up as plain text
            return HttpUtility.HtmlEncode(m.Groups[6].Value);
        }

        // One of the three alternatives always participates in a match, so this is
        // not expected to be reached -- should log error to server if it ever is.
        return string.Empty;
    }

    /**
    * addHTTP makes certain a link or src has http:// or https:// in the begining
    * (to prevent relative links like http://x.com/www.google.com)
    * The scheme check ignores case and requires the full 'http://' or 'https://' prefix,
    * so 'HTTP://x.com' is left alone and 'httpd.apache.org' gets the prefix.
    * @param string txt a link
    * @return string link with http:// (an empty link stays empty)
    */
    private static string addHTTP(string txt)
    {
        if (string.IsNullOrEmpty(txt))
        {
            return string.Empty;
        }

        bool hasScheme = txt.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            || txt.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        return hasScheme ? txt : "http://" + txt;
    }

    /**
    * fixURL removes any invalid characters from an url and uses addHTTP to verify correct formatting.
    * Order matters: tags and script prefixes are removed first (stripEvil), then every character
    * outside the allowed set is dropped, and only then is the scheme checked.
    * @requires addHTTP()
    * @requires stripEvil()
    * @param string url is the link
    * @return string the cleaned URL, starting with http:// or https:// unless it is empty
    */
    private static string fixURL(string url)
    {
        string cleaned = InvalidUrlCharsPattern.Replace(stripEvil(url), string.Empty);
        return addHTTP(cleaned);
    }

    /**
    * stripEvil removes all tags (stripTags) and then all script prefixes (stripMal).
    * MUST USE THIS OR ANOTHER FILTER BEFORE ANYTHING GOES INTO THE DATABASE!
    * NOTE(review): a '<' that is never followed by '>' is not removed -- confirm that the
    * rendering side escapes it.
    * @param string txt is the text to filter
    * @return string text without tags and without script prefixes
    */
    private static string stripEvil(string txt)
    {
        return stripMal(stripTags(txt));
    }

    /**
    * stripMal strips script injection prefixes: 'script:' and 'data:', optionally
    * preceded by 'java' or 'vb' (so 'javascript:', 'vbscript:', ...), ignoring case.
    * The removal is repeated until nothing is left to remove, because deleting one
    * occurrence can join the surrounding characters into a new one
    * ('javajavascript:script:' would otherwise come out as 'javascript:').
    * NOTE(review): the pattern is not limited to URLs, so ordinary text containing
    * 'data:' or 'script:' loses those characters as well.
    * @param string txt is the text to filter
    * @return string text stripped of any script prefix
    */
    private static string stripMal(string txt)
    {
        int lengthBefore;
        do
        {
            // Every replacement deletes characters, so an unchanged length means a clean pass.
            lengthBefore = txt.Length;
            txt = ScriptSchemePattern.Replace(txt, string.Empty);
        }
        while (txt.Length != lengthBefore);

        return txt;
    }

    /**
    * stripTags strips all tags (elements within <brackets>)
    * @param string txt is the text to filter
    * @return string text stripped of any tags
    */
    private static string stripTags(string txt)
    {
        return AnyTagPattern.Replace(txt, string.Empty);
    }
    #endregion
}
}