using System;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;

namespace Server.Utils
{
    /// <summary>
    /// Collection of generic text utilities for cleaning client-submitted text.
    /// </summary>
    /// <remarks>
    /// NOTE(review): every pattern in this class requires a closing '&gt;' to recognise a tag,
    /// so a lone '&lt;' with no matching '&gt;' is returned untouched and unencoded.
    /// Confirm that callers encode or otherwise handle such text before rendering it.
    /// </remarks>
    public static class TextUtils
    {
        // Templates used to rebuild the only two tags that are allowed through.
        // Rebuilding (instead of keeping the submitted tag) drops every other attribute.
        private const string AnchorTemplate = "<a href=\"{0}\">{1}</a>";
        private const string ImageTemplate = "<img src=\"{0}\" />";

        // Matches, in order of preference: an 'a' tag, an 'img' tag, or any other tag.
        //   group 1: full 'a' tag, from '<a ... href' up to the first '</a>'
        //   group 2:   href value (read up to the next quote, space or '>')
        //   group 3:   inner content of the 'a' tag (may itself contain tags)
        //   group 4: full 'img' tag
        //   group 5:   src value (read up to the next quote, space or '>')
        //   group 6: any other '<...>' tag
        // '.' does not match a newline here, so an 'a' tag whose content spans several
        // lines is not matched as an anchor; its tags fall through to group 6.
        private static readonly Regex HtmlPattern = new Regex(
            "(<a[^>]*href[ ]*=[\"' ]?([^'\" >]*)[^>]*>((?:(?<!</a>).)*)</a>)|" +
            "(<img[^>]*src[ ]*?=(?:[\"' ]?)*([^'\" >]*)[^>]*?>)|(<[^>]*?>)",
            RegexOptions.IgnoreCase);

        // Any run of characters that is NOT on the URL whitelist.
        private static readonly Regex InvalidUrlCharsPattern = new Regex(
            @"[^\w#!:.,?'~+=$*()&%@!\-_/]+");

        // 'script:' and 'data:' with an optional 'java' / 'vb' prefix
        // (javascript:, vbscript:, script:, data:, ...).
        private static readonly Regex ScriptSchemePattern = new Regex(
            @"(?:java|vb)?(?:script|data):",
            RegexOptions.IgnoreCase);

        // Anything between '<' and the next '>'.
        private static readonly Regex TagPattern = new Regex(@"<[^>]*?>");

        /// <summary>
        /// Filters HTML from the supplied text.
        /// </summary>
        /// <param name="txt">The text to filter. Null or empty yields an empty string.</param>
        /// <param name="allowHTML">
        /// If false, every tag is removed. If true, anchors and images are rebuilt from
        /// safe templates and every other tag is HTML-encoded.
        /// </param>
        /// <returns>The filtered text.</returns>
        public static string FilterText(string txt, bool allowHTML)
        {
            // The regex helpers throw on null, so treat "nothing submitted" as empty text.
            if (string.IsNullOrEmpty(txt))
            {
                return String.Empty;
            }

            // TODO: add linksOnly support
            return allowHTML ? filterHTML(txt) : stripEvil(txt);
        }

        #region private methods

        /// <summary>
        /// Rewrites the HTML found in the text: anchors and images are rebuilt from the
        /// safe templates, every other tag is HTML-encoded, plain text is left as is.
        /// </summary>
        /// <param name="txt">The text to filter.</param>
        /// <returns>The text with every matched tag replaced.</returns>
        private static string filterHTML(string txt)
        {
            // Examples (input -> output):
            //   just some plain text
            //       -> just some plain text
            //   <blink>hi</blink>
            //       -> &lt;blink&gt;hi&lt;/blink&gt;
            //   <a href="example.com" onclick="evil()">link</a>
            //       -> <a href="http://example.com">link</a>
            //   <a href="javascript:alert(1);">alert</a>
            //       -> <a href="http://alert(1)">alert</a>
            //   <img src="pic.jpg" onload="evil()">
            //       -> <img src="http://pic.jpg" />
            return HtmlPattern.Replace(txt, new MatchEvaluator(replaceTag));
        }

        /// <summary>
        /// Produces the replacement text for a single match of the HTML pattern.
        /// </summary>
        /// <param name="m">A match of the HTML pattern.</param>
        /// <returns>The rebuilt or encoded tag.</returns>
        private static string replaceTag(Match m)
        {
            if (m.Groups[1].Success)
            {
                // 'a' tag: clean the link target and filter the inner content again,
                // because the content may itself contain tags (an image, for example).
                string href = m.Groups[2].Value;
                string inner = m.Groups[3].Value;
                return String.Format(AnchorTemplate, fixURL(href), filterHTML(stripMal(inner)));
            }

            if (m.Groups[4].Success)
            {
                // 'img' tag: only the cleaned source survives.
                string src = m.Groups[5].Value;
                return String.Format(ImageTemplate, fixURL(src));
            }

            if (m.Groups[6].Success)
            {
                // Any other tag: encode it so it is displayed instead of interpreted.
                return HttpUtility.HtmlEncode(m.Groups[6].Value);
            }

            // Every alternative of the pattern sets group 1, 4 or 6, so this is not
            // expected to be reached.
            return String.Empty;
        }

        /// <summary>
        /// Makes certain a link or source starts with http:// or https://, so that it
        /// cannot be interpreted as a relative link or as another scheme.
        /// </summary>
        /// <param name="txt">The link.</param>
        /// <returns>The link, prefixed with http:// when it had no http(s) scheme.</returns>
        private static string addHTTP(string txt)
        {
            // Ordinal and case-insensitive: "HTTPS://x" is already absolute, whereas
            // "httpfoo.com" merely starts with the letters "http" and still needs a scheme.
            bool hasScheme =
                txt.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                txt.StartsWith("https://", StringComparison.OrdinalIgnoreCase);

            return hasScheme ? txt : "http://" + txt;
        }

        /// <summary>
        /// Cleans a URL: removes tags, removes every character that is not on the
        /// whitelist, removes script/data schemes, then makes certain the result
        /// starts with http(s)://.
        /// </summary>
        /// <param name="url">The link.</param>
        /// <returns>The cleaned link.</returns>
        private static string fixURL(string url)
        {
            // The scheme strip runs AFTER the character whitelist: removing a disallowed
            // character (a tab, for example) could otherwise join two halves of a scheme.
            string cleaned = InvalidUrlCharsPattern.Replace(stripTags(url), String.Empty);
            return addHTTP(stripMal(cleaned));
        }

        /// <summary>
        /// Removes all tags and all script/data schemes from the text.
        /// </summary>
        /// <param name="txt">The text to filter.</param>
        /// <returns>The text without tags and without script/data schemes.</returns>
        private static string stripEvil(string txt)
        {
            return stripMal(stripTags(txt));
        }

        /// <summary>
        /// Removes script/data schemes ('javascript:', 'vbscript:', 'script:', 'data:', ...)
        /// from the text, case-insensitively.
        /// </summary>
        /// <param name="txt">The text to filter.</param>
        /// <returns>The text with no remaining occurrence of those schemes.</returns>
        private static string stripMal(string txt)
        {
            // One pass is not enough: removing the inner "script:" from
            // "javascrscript:ipt:" leaves "javascript:". Repeat until nothing is removed.
            // Each pass that changes the text shortens it, so the loop terminates.
            string previous;
            do
            {
                previous = txt;
                txt = ScriptSchemePattern.Replace(previous, String.Empty);
            }
            while (txt.Length != previous.Length);

            return txt;
        }

        /// <summary>
        /// Removes all tags (anything between '&lt;' and the next '&gt;') from the text.
        /// </summary>
        /// <param name="txt">The text to filter.</param>
        /// <returns>The text without tags.</returns>
        private static string stripTags(string txt)
        {
            return TagPattern.Replace(txt, String.Empty);
        }

        #endregion
    }
}