using System;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;

namespace Server.Utils
{
    /**
     * Collection of generic text utilities
     */
    public static class TextUtils
    {
        // Templates used to rebuild the only two tags that are allowed through filterHTML.
        private const string AnchorTemplate = "<a href=\"{0}\">{1}</a>";
        private const string ImageTemplate = "<img src=\"{0}\" />";

        /*
         * Matches, in order of preference: an 'a' tag, an 'img' tag, or any other tag.
         *
         * Capture groups:
         *   1 = full 'a' tag (opening tag, inner text, closing tag)
         *   2 = href value of the 'a' tag
         *   3 = inner text of the 'a' tag (may itself contain tags)
         *   4 = full 'img' tag
         *   5 = src value of the 'img' tag
         *   6 = any other tag
         *
         * The patterns contain no ^ or $ anchors, so RegexOptions.Multiline is not needed.
         * '.' does not match a newline (Singleline is not set), so an anchor whose inner text
         * spans several lines is not recognised as an anchor; its tags fall through to group 6.
         */
        private static readonly Regex HtmlTagPattern = new Regex(
            "(<a[^>]*href[ ]*=[\"' ]?([^'\" >]*)[^>]*>((?:(?<!</a>).)*)</a>)|" +
            "(<img[^>]*src[ ]*?=(?:[\"' ]?)*([^'\" >]*)[^>]*?>)|(<[^>]*?>)",
            RegexOptions.IgnoreCase);

        // Any run of characters that is not allowed inside a rebuilt URL.
        // '+' rather than '*' so the engine does not produce an empty match at every position.
        private static readonly Regex InvalidUrlChars = new Regex(@"[^\w#!:.,?'~+=$*()&%@!\-_/]+");

        // 'script:' and 'data:' scheme markers, optionally prefixed with 'java' or 'vb'.
        private static readonly Regex ScriptScheme = new Regex(
            @"(?:java|vb)?(?:script|data):",
            RegexOptions.IgnoreCase);

        // Anything between '<' and the next '>'.
        private static readonly Regex AnyTag = new Regex(@"<[^>]*?>");

        /**
         * Filters HTML from supplied text
         *
         * @param string txt the text to filter; null or empty yields an empty string
         * @param bool allowHTML If false, every tag is removed. If true, anchors and images are
         *                       rebuilt from fixed templates and every other tag is HTML-encoded
         * @return string filtered text
         */
        public static string FilterText(string txt, bool allowHTML)
        {
            // Regex.Replace throws on null input; treat "nothing to filter" as an empty result.
            if (string.IsNullOrEmpty(txt))
            {
                return String.Empty;
            }

            if (allowHTML)
            {
                return filterHTML(txt); // TODO: add linksOnly support
            }

            return stripEvil(txt);
        }

        #region private methods

        /**
         * filterHTML rewrites every tag found in the text:
         *   - <a href="...">text</a> is rebuilt as <a href="{fixURL(href)}">{filtered text}</a>;
         *     every other attribute is dropped and the inner text is filtered recursively
         *   - <img src="..."> is rebuilt as <img src="{fixURL(src)}" />; every other attribute is dropped
         *   - any other tag (including an <a> with no matching </a>) is HTML-encoded so it is
         *     displayed as text
         * Text outside of tags is returned unchanged.
         *
         * @param string txt the text to filter
         * @requires fixURL()
         * @requires stripMal()
         * @example
         *   filterHTML('just some plain text')
         *     returns 'just some plain text'
         *   filterHTML('<img src="asdf" onload="x()">')
         *     returns '<img src="http://asdf" />'
         *   filterHTML('<blink>hi</blink>')
         *     returns '&lt;blink&gt;hi&lt;/blink&gt;'
         *   filterHTML('<a href="javascript:alert(1);">alert</a>')
         *     returns '<a href="http://alert(1)">alert</a>'
         * @return string filtered text
         *
         * sample inputs for manual testing:
         *   an image: <img src="http://min3.net/comics/WebComic-009_IRC_Lingo.gif" alt="asdf" /> the end!
         *   an image: <img src="min3.net/comics/WebComic-009_IRC_Lingo.gif" onmouseover="asdf()">text!</img> the end!
         *   a link: <a href="www.google.com">Google</a> the end!
         *   a bad link: <a href="javascript:alert(1);">alert</a> the end!
         *   a bad link with html: <a href="data:text/plain,<script>badStuff();</script>">click me</a> the end!
         *   a link around an image: <a href="google.com"><img src="http://min3.net/comics/WebComic-009_IRC_Lingo.gif"> </img>caption for image</a> the end!
         *   a link around an image with bad stuff: <a href="http://google.com" onclick="evil()">asdf <img src="asdf.jpg" onclick="evil()" />image caption</a> the end!
         */
        private static string filterHTML(string txt)
        {
            return HtmlTagPattern.Replace(txt, new MatchEvaluator(rebuildTag));
        }

        /**
         * rebuildTag produces the replacement text for a single HtmlTagPattern match
         * @param Match m a match produced by HtmlTagPattern
         * @return string the rebuilt anchor or image, or the HTML-encoded tag
         */
        private static string rebuildTag(Match m)
        {
            if (m.Groups[1].Success) // matched 'a' tag
            {
                string href = m.Groups[2].Value;
                string linkText = m.Groups[3].Value;

                // The link text may contain nested tags (e.g. an image), so it is filtered again.
                return String.Format(AnchorTemplate, fixURL(href), filterHTML(stripMal(linkText)));
            }

            if (m.Groups[4].Success) // matched 'img' tag
            {
                return String.Format(ImageTemplate, fixURL(m.Groups[5].Value));
            }

            if (m.Groups[6].Success) // matched any other tag
            {
                return HttpUtility.HtmlEncode(m.Groups[6].Value); // encode it rather than wipe it
            }

            return String.Empty; // should never get here (missed something?)--should log error to server
        }

        /**
         * addHTTP makes certain a link or src has http:// in the beginning (to prevent links like http://x.com/www.google.com)
         * A link that already starts with http:// or https:// (in any letter case) is returned unchanged.
         * @param string txt a link
         * @return string link with http://
         */
        private static string addHTTP(string txt)
        {
            // Ordinal + ignore-case: a URL scheme is not linguistic text, and "HTTP://" is as valid as "http://".
            // Checking for the full "http://" keeps hosts such as "httpfoo.com" from being mistaken for a scheme.
            bool hasScheme = txt.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || txt.StartsWith("https://", StringComparison.OrdinalIgnoreCase);

            if (hasScheme)
            {
                return txt;
            }

            return "http://" + txt;
        }

        /**
         * fixURL removes any invalid characters from an url and uses addHTTP to verify correct formatting
         * @requires addHTTP()
         * @requires stripEvil()
         * @requires stripMal()
         * @param string url is the link
         * @return string the cleaned link, always starting with http:// or https://
         */
        private static string fixURL(string url)
        {
            string cleaned = InvalidUrlChars.Replace(stripEvil(url), string.Empty);

            // Removing invalid characters can splice a blocked marker back together
            // (e.g. "scr|ipt:" becomes "script:"), so the markers are scrubbed once more.
            cleaned = stripMal(cleaned);

            return addHTTP(cleaned);
        }

        /**
         * stripEvil removes all tags and then all script/data scheme markers
         * MUST USE THIS OR ANOTHER FILTER BEFORE ANYTHING GOES INTO THE DATABASE!
         * @requires stripTags()
         * @requires stripMal()
         * @param string txt is the text to filter
         * @return string text without tags and without scheme markers
         */
        private static string stripEvil(string txt)
        {
            return stripMal(stripTags(txt));
        }

        /**
         * stripMal removes script injection markers: 'script:' and 'data:', optionally
         * prefixed with 'java' or 'vb' ('javascript:', 'vbscript:', ...), in any letter case
         * @param string txt is the text to filter
         * @return string text in which none of those markers remain
         */
        private static string stripMal(string txt)
        {
            // A single pass is not enough: removing the inner marker from "javasjavascript:cript:"
            // leaves "javascript:" behind. Repeat until a pass removes nothing. Every pass that
            // changes the text shortens it, so the loop always terminates.
            string previous;
            do
            {
                previous = txt;
                txt = ScriptScheme.Replace(previous, string.Empty);
            }
            while (txt.Length != previous.Length);

            return txt;
        }

        /**
         * stripTags strips all tags (elements within <brackets>)
         * @param string txt is the text to filter
         * @return string text stripped of any tags
         */
        private static string stripTags(string txt)
        {
            return AnyTag.Replace(txt, string.Empty);
        }

        #endregion
    }
}