diff --git a/README.md b/README.md index 8d55c86..99f20d0 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ It is recommended to integrate this so that it is automatically used whenever an Credit =========== Original code was taken from John MacFarlane's Pandoc (with permission), but modified to be faster and with parsing redone using TagSoup. html5lib is now being used as a reference (BSD style license). -Michael Snoyman added the balanced tags functionality. +Michael Snoyman added the balanced tags functionality and released css-text specifically to help with css parsing. Limitations @@ -57,12 +57,16 @@ The [source code of html5lib](http://code.google.com/p/html5lib/source/browse/py If anyone knows of better sources or thinks a particular tag/attribute/value may be vulnerable, please let me know. [HTML Purifier](http://htmlpurifier.org/live/smoketests/printDefinition.php) does have a more permissive and configurable (yet safe) white list if you are looking to add anything. -attributes data and style +style attribute +---------------- +style attributes are now *parsed* with the css-text and autoparsec-text dependencies. They are then ran through a white list for properties and keywords. Whitespace is not preserved. + +data attributes ------------------------- -These attributes are not on the white list. -The href attribute is white listed, but its value must pass through a white list also. This is how the data and style attributes could work also. However, this was never implemented in Pandoc, and the html5lib code is complicated and relies on regular expressions that I don't understand. +data attributes are not on the white list. +The href attribute is white listed, but its value must pass through a white list also. This is how the data attributes could work also. svg and mathml -------------- -A mathml white list is fully implemented. -There is a white list for svg elements and attributes. However, some elements are not included because they need further filtering (just like the data and style html attributes) +A mathml white list is fully implemented. There is some support for svg styling. +There is a full white list for svg elements and attributes. However, some elements are not included because they need further filtering (just like the data attributes) and this has not been done yet. diff --git a/Text/HTML/SanitizeXSS.hs b/Text/HTML/SanitizeXSS.hs index 137ee14..72f000d 100644 --- a/Text/HTML/SanitizeXSS.hs +++ b/Text/HTML/SanitizeXSS.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE OverloadedStrings #-} module Text.HTML.SanitizeXSS ( sanitize , sanitizeBalance @@ -6,10 +7,14 @@ module Text.HTML.SanitizeXSS , safeTags ) where +import Text.HTML.SanitizeXSS.Css + import Text.HTML.TagSoup import Data.Set (Set(), member, notMember, (\\), fromList) import Data.Char ( toLower ) +import Data.Text (Text) +import qualified Data.Text as T import Network.URI ( parseURIReference, URI (..), isAllowedInURI, escapeURIString, uriScheme ) @@ -17,32 +22,27 @@ import Codec.Binary.UTF8.String ( encodeString ) import qualified Data.Map as Map -{- -import Debug.Trace -debug :: (Show a) => a -> a -debug a = trace ("DEBUG: " ++ show a) a - -} -- | santize the html to prevent XSS attacks. See README.md for more details -sanitize :: String -> String +sanitize :: Text -> Text sanitize = sanitizeXSS -- | alias of sanitize function -sanitizeXSS :: String -> String +sanitizeXSS :: Text -> Text sanitizeXSS = filterTags safeTags -- | same as sanitize but makes sure there are no lone closing tags. See README.md for more details -sanitizeBalance :: String -> String +sanitizeBalance :: Text -> Text sanitizeBalance = filterTags (balance Map.empty . safeTags) -- | insert custom tag filtering. Don't forget to compose your filter with safeTags! -filterTags :: ([Tag String] -> [Tag String]) -> String -> String +filterTags :: ([Tag Text] -> [Tag Text]) -> Text -> Text filterTags f = renderTagsOptions renderOptions { optMinimize = \x -> x `elem` ["br","img"] -- converts to , converts to } . f . canonicalizeTags . parseTags -balance :: Map.Map String Int -> [Tag String] -> [Tag String] +balance :: Map.Map Text Int -> [Tag Text] -> [Tag Text] balance m [] = concatMap go $ Map.toList m where @@ -67,28 +67,33 @@ balance m (TagOpen name as : tags) = balance m (t:ts) = t : balance m ts -- | Filters out any usafe tags and attributes. Use with filterTags to create a custom filter. -safeTags :: [Tag String] -> [Tag String] +safeTags :: [Tag Text] -> [Tag Text] safeTags [] = [] safeTags (t@(TagClose name):tags) | safeTagName name = t : safeTags tags | otherwise = safeTags tags safeTags (TagOpen name attributes:tags) - | safeTagName name = TagOpen name (filter safeAttribute attributes) : safeTags tags + | safeTagName name = TagOpen name + (map sanitizeAttribute $ filter safeAttribute attributes) : safeTags tags | otherwise = safeTags tags safeTags (t:tags) = t:safeTags tags -safeTagName :: String -> Bool +safeTagName :: Text -> Bool safeTagName tagname = tagname `member` sanitaryTags -safeAttribute :: (String, String) -> Bool +safeAttribute :: (Text, Text) -> Bool safeAttribute (name, value) = name `member` sanitaryAttributes && - (name `notMember` attrValIsUri || sanitaryURI value) + (name `notMember` uri_attributes || sanitaryURI value) + +sanitizeAttribute :: (Text, Text) -> (Text, Text) +sanitizeAttribute ("style", value) = ("style", sanitizeCSS value) +sanitizeAttribute attrs = attrs -- | Returns @True@ if the specified URI is not a potential security risk. -sanitaryURI :: String -> Bool +sanitaryURI :: Text -> Bool sanitaryURI u = - case parseURIReference (escapeURI u) of + case parseURIReference (escapeURI $ T.unpack u) of Just p -> (null (uriScheme p)) || ((map toLower $ init $ uriScheme p) `member` safeURISchemes) Nothing -> False @@ -102,19 +107,21 @@ escapeURI = escapeURIString isAllowedInURI . encodeString safeURISchemes :: Set String safeURISchemes = fromList acceptable_protocols -sanitaryTags :: Set String +sanitaryTags :: Set Text sanitaryTags = fromList (acceptable_elements ++ mathml_elements ++ svg_elements) \\ (fromList svg_allow_local_href) -- extra filtering not implemented -sanitaryAttributes :: Set String -sanitaryAttributes = fromList (acceptable_attributes ++ mathml_attributes ++ svg_attributes) +sanitaryAttributes :: Set Text +sanitaryAttributes = fromList (allowed_html_uri_attributes ++ acceptable_attributes ++ mathml_attributes ++ svg_attributes) \\ (fromList svg_attr_val_allows_ref) -- extra unescaping not implemented -attrValIsUri :: Set String -attrValIsUri = fromList ["href", "src", "cite", "action", "longdesc", - "xlink:href", "xml:base"] +allowed_html_uri_attributes :: [Text] +allowed_html_uri_attributes = ["href", "src", "cite", "action", "longdesc"] -acceptable_elements :: [String] +uri_attributes :: Set Text +uri_attributes = fromList $ allowed_html_uri_attributes ++ ["xlink:href", "xml:base"] + +acceptable_elements :: [Text] acceptable_elements = ["a", "abbr", "acronym", "address", "area", "article", "aside", "audio", "b", "big", "blockquote", "br", "button", "canvas", "caption", "center", "cite", "code", "col", "colgroup", @@ -129,7 +136,7 @@ acceptable_elements = ["a", "abbr", "acronym", "address", "area", "tbody", "td", "textarea", "time", "tfoot", "th", "thead", "tr", "tt", "u", "ul", "var", "video"] -mathml_elements :: [String] +mathml_elements :: [Text] mathml_elements = ["maction", "math", "merror", "mfrac", "mi", "mmultiscripts", "mn", "mo", "mover", "mpadded", "mphantom", "mprescripts", "mroot", "mrow", "mspace", "msqrt", "mstyle", "msub", @@ -137,7 +144,7 @@ mathml_elements = ["maction", "math", "merror", "mfrac", "mi", "munderover", "none"] -- this should include altGlyph I think -svg_elements :: [String] +svg_elements :: [Text] svg_elements = ["a", "animate", "animateColor", "animateMotion", "animateTransform", "clipPath", "circle", "defs", "desc", "ellipse", "font-face", "font-face-name", "font-face-src", "g", "glyph", "hkern", @@ -145,27 +152,27 @@ svg_elements = ["a", "animate", "animateColor", "animateMotion", "mpath", "path", "polygon", "polyline", "radialGradient", "rect", "set", "stop", "svg", "switch", "text", "title", "tspan", "use"] -acceptable_attributes :: [String] +acceptable_attributes :: [Text] acceptable_attributes = ["abbr", "accept", "accept-charset", "accesskey", - "action", "align", "alt", "autocomplete", "autofocus", "axis", + "align", "alt", "autocomplete", "autofocus", "axis", "background", "balance", "bgcolor", "bgproperties", "border", "bordercolor", "bordercolordark", "bordercolorlight", "bottompadding", "cellpadding", "cellspacing", "ch", "challenge", "char", "charoff", - "choff", "charset", "checked", "cite", "class", "clear", "color", + "choff", "charset", "checked", "class", "clear", "color", "cols", "colspan", "compact", "contenteditable", "controls", "coords", -- "data", TODO: allow this with further filtering "datafld", "datapagesize", "datasrc", "datetime", "default", "delay", "dir", "disabled", "draggable", "dynsrc", "enctype", "end", "face", "for", "form", "frame", "galleryimg", "gutter", "headers", - "height", "hidefocus", "hidden", "high", "href", "hreflang", "hspace", + "height", "hidefocus", "hidden", "high", "hreflang", "hspace", "icon", "id", "inputmode", "ismap", "keytype", "label", "leftspacing", - "lang", "list", "longdesc", "loop", "loopcount", "loopend", + "lang", "list", "loop", "loopcount", "loopend", "loopstart", "low", "lowsrc", "max", "maxlength", "media", "method", "min", "multiple", "name", "nohref", "noshade", "nowrap", "open", "optimum", "pattern", "ping", "point-size", "prompt", "pqg", "radiogroup", "readonly", "rel", "repeat-max", "repeat-min", "replace", "required", "rev", "rightspacing", "rows", "rowspan", - "rules", "scope", "selected", "shape", "size", "span", "src", "start", + "rules", "scope", "selected", "shape", "size", "span", "start", "step", -- "style", TODO: allow this with further filtering "summary", "suppress", "tabindex", "target", @@ -179,7 +186,7 @@ acceptable_protocols = [ "ed2k", "ftp", "http", "https", "irc", "xmpp", "callto", "feed", "urn", "aim", "rsync", "tag", "ssh", "sftp", "rtsp", "afs" ] -mathml_attributes :: [String] +mathml_attributes :: [Text] mathml_attributes = ["actiontype", "align", "columnalign", "columnalign", "columnalign", "columnlines", "columnspacing", "columnspan", "depth", "display", "displaystyle", "equalcolumns", "equalrows", "fence", @@ -190,7 +197,7 @@ mathml_attributes = ["actiontype", "align", "columnalign", "columnalign", "separator", "stretchy", "width", "width", "xlink:href", "xlink:show", "xlink:type", "xmlns", "xmlns:xlink"] -svg_attributes :: [String] +svg_attributes :: [Text] svg_attributes = ["accent-height", "accumulate", "additive", "alphabetic", "arabic-form", "ascent", "attributeName", "attributeType", "baseProfile", "bbox", "begin", "by", "calcMode", "cap-height", @@ -221,43 +228,14 @@ svg_attributes = ["accent-height", "accumulate", "additive", "alphabetic", "y1", "y2", "zoomAndPan"] -- the values for these need to be escaped -svg_attr_val_allows_ref :: [String] +svg_attr_val_allows_ref :: [Text] svg_attr_val_allows_ref = ["clip-path", "color-profile", "cursor", "fill", "filter", "marker", "marker-start", "marker-mid", "marker-end", "mask", "stroke"] -svg_allow_local_href :: [String] +svg_allow_local_href :: [Text] svg_allow_local_href = ["altGlyph", "animate", "animateColor", "animateMotion", "animateTransform", "cursor", "feImage", "filter", "linearGradient", "pattern", "radialGradient", "textpath", "tref", "set", "use"] -{- style value (css) filtering not implemented - - - - this is used for css filtering -allowed_svg_properties = fromList acceptable_svg_properties -acceptable_svg_properties = [ "fill", "fill-opacity", "fill-rule", - "stroke", "stroke-width", "stroke-linecap", "stroke-linejoin", - "stroke-opacity"] - - -allowed_css_properties = fromList acceptable_css_properties -allowed_css_keywords = fromList acceptable_css_keywords -acceptable_css_properties = ["azimuth", "background-color", - "border-bottom-color", "border-collapse", "border-color", - "border-left-color", "border-right-color", "border-top-color", "clear", - "color", "cursor", "direction", "display", "elevation", "float", "font", - "font-family", "font-size", "font-style", "font-variant", "font-weight", - "height", "letter-spacing", "line-height", "overflow", "pause", - "pause-after", "pause-before", "pitch", "pitch-range", "richness", - "speak", "speak-header", "speak-numeral", "speak-punctuation", - "speech-rate", "stress", "text-align", "text-decoration", "text-indent", - "unicode-bidi", "vertical-align", "voice-family", "volume", - "white-space", "width"] -acceptable_css_keywords = ["auto", "aqua", "black", "block", "blue", - "bold", "both", "bottom", "brown", "center", "collapse", "dashed", - "dotted", "fuchsia", "gray", "green", "!important", "italic", "left", - "lime", "maroon", "medium", "none", "navy", "normal", "nowrap", "olive", - "pointer", "purple", "red", "right", "solid", "silver", "teal", "top", - "transparent", "underline", "white", "yellow"] --} diff --git a/Text/HTML/SanitizeXSS/Css.hs b/Text/HTML/SanitizeXSS/Css.hs new file mode 100644 index 0000000..4705097 --- /dev/null +++ b/Text/HTML/SanitizeXSS/Css.hs @@ -0,0 +1,105 @@ +{-# LANGUAGE OverloadedStrings #-} +module Text.HTML.SanitizeXSS.Css (sanitizeCSS) where + +import Data.Text (Text) +import qualified Data.Text as T +import Data.Attoparsec.Text +import Data.Text.Lazy.Builder (toLazyText) +import Data.Text.Lazy (toStrict) +import Data.Set (member, fromList, Set) +import Data.Char (isDigit) +import Control.Applicative ((<|>)) +import Text.CSS.Render (renderAttrs) +import Text.CSS.Parse (parseAttrs) + + +-- this is a direct translation from sanitizer.py, except +-- sanitizer.py filters out url(), but this is redundant +sanitizeCSS :: Text -> Text +sanitizeCSS css = toStrict . toLazyText . renderAttrs . filter isSanitaryAttr $ parseAttributes + where + parseAttributes = case parseOnly parseAttrs css of + Left _ -> [] + Right as -> as + + isSanitaryAttr (prop, value) + | prop `member` allowed_css_properties = True + | (T.takeWhile (/= '-') value) `member` allowed_css_unit_properties && + all allowedCssAttributeValue (T.words value) = True + | prop `member` allowed_svg_properties = True + | otherwise = False + + allowed_css_unit_properties :: Set Text + allowed_css_unit_properties = fromList ["background","border","margin","padding"] + + allowedCssAttributeValue :: Text -> Bool + allowedCssAttributeValue val = + val `member` allowed_css_keywords || + case parseOnly allowedCssAttributeParser val of + Left _ -> False + Right b -> b + where + allowedCssAttributeParser = do + hex <|> rgb <|> cssUnit + + aToF = fromList "abcdef" + + hex = do + _ <- char '#' + hx <- takeText + return $ T.all (\c -> isDigit c || (c `member` aToF)) hx + + rgb = do + _<- string "rgb(" + skip isDigit >> try (skipWhile isDigit) >> try (skip (== '%')) + skip (== ',') + try (skipWhile isDigit) >> try (skip (== '%')) + skip (== ',') + try (skipWhile isDigit) >> try (skip (== '%')) + skip (== ',') + skip (== ')') + return True + + cssUnit = do + try $ skip isDigit >> skip isDigit + try $ skip (== '.') + try $ skip isDigit >> skip isDigit + unit <- takeText + return $ unit `member` allowed_css_attribute_value_units + +allowed_css_attribute_value_units :: Set Text +allowed_css_attribute_value_units = fromList + [ "cm", "em", "ex", "in", "mm", "pc", "pt", "px", "%", ",", "\\"] + +allowed_css_properties :: Set Text +allowed_css_properties = fromList acceptable_css_properties + where + acceptable_css_properties = ["azimuth", "background-color", + "border-bottom-color", "border-collapse", "border-color", + "border-left-color", "border-right-color", "border-top-color", "clear", + "color", "cursor", "direction", "display", "elevation", "float", "font", + "font-family", "font-size", "font-style", "font-variant", "font-weight", + "height", "letter-spacing", "line-height", "overflow", "pause", + "pause-after", "pause-before", "pitch", "pitch-range", "richness", + "speak", "speak-header", "speak-numeral", "speak-punctuation", + "speech-rate", "stress", "text-align", "text-decoration", "text-indent", + "unicode-bidi", "vertical-align", "voice-family", "volume", + "white-space", "width"] + +allowed_css_keywords :: Set Text +allowed_css_keywords = fromList acceptable_css_keywords + where + acceptable_css_keywords = ["auto", "aqua", "black", "block", "blue", + "bold", "both", "bottom", "brown", "center", "collapse", "dashed", + "dotted", "fuchsia", "gray", "green", "!important", "italic", "left", + "lime", "maroon", "medium", "none", "navy", "normal", "nowrap", "olive", + "pointer", "purple", "red", "right", "solid", "silver", "teal", "top", + "transparent", "underline", "white", "yellow"] + +-- used in css filtering +allowed_svg_properties :: Set Text +allowed_svg_properties = fromList acceptable_svg_properties + where + acceptable_svg_properties = [ "fill", "fill-opacity", "fill-rule", + "stroke", "stroke-width", "stroke-linecap", "stroke-linejoin", + "stroke-opacity"] diff --git a/test.hs b/test.hs deleted file mode 100644 index 6ff26c3..0000000 --- a/test.hs +++ /dev/null @@ -1,21 +0,0 @@ -import Text.HTML.SanitizeXSS - -testHTML = " safeanchor

Unbalanced" - -test f actual expected = do - putStrLn $ "testing: " ++ actual - putStrLn $ if f actual == expected then "pass" else "failure\n" ++ "\nexpected:" ++ (show expected) ++ "\nactual: " ++ (show actual) - -main = do - test sanitizeBalance testHTML " safeanchor
Unbalanced
" - test sanitize testHTML " safeanchor
Unbalanced" - let testRelativeURI = "bar" - test sanitize testRelativeURI testRelativeURI - let protocol_hack = "" - test sanitize protocol_hack "" - let object_hack = "" - test sanitize object_hack "" - let embed_hack = "" - test sanitize embed_hack "" - let ucase_image_hack = "" - test sanitize ucase_image_hack "" diff --git a/xss-sanitize.cabal b/xss-sanitize.cabal index 6da1237..5318c07 100644 --- a/xss-sanitize.cabal +++ b/xss-sanitize.cabal @@ -1,5 +1,5 @@ name: xss-sanitize -version: 0.2.6 +version: 0.3.0 license: BSD3 license-file: LICENSE author: Greg Weber @@ -9,17 +9,39 @@ description: run untrusted HTML through Text.HTML.SanitizeXSS.sanitizeXSS to category: Web stability: Stable -cabal-version: >= 1.6 +cabal-version: >= 1.8 build-type: Simple homepage: http://github.com/gregwebs/haskell-xss-sanitize library - build-depends: base == 4.*, containers, - tagsoup >= 0.11, utf8-string >= 0.3, network >= 2 + build-depends: base == 4.*, containers + , tagsoup >= 0.11 + , utf8-string >= 0.3 + , network >= 2 + , css-text >= 0.1 && < 0.2 + , text >= 0.11 && < 0.12 + , attoparsec-text >= 0.8.5.1 && < 0.9 + , file-location + exposed-modules: Text.HTML.SanitizeXSS + other-modules: Text.HTML.SanitizeXSS.Css ghc-options: -Wall +test-suite test + type: exitcode-stdio-1.0 + main-is: test/main.hs + build-depends: base >= 4 && < 5, containers + , xss-sanitize >= 0.3 + , tagsoup >= 0.11 + , utf8-string >= 0.3 + , network >= 2 + , css-text >= 0.1 && < 0.2 + , text >= 0.11 && < 0.12 + , attoparsec-text >= 0.8.5.1 && < 0.9 + , file-location + + source-repository head type: git location: http://github.com/gregwebs/haskell-xss-sanitize.git