yesod-test: Improve CSS selector parser.

* Tighten up what the parser will accept (especially wrt identifiers). * Write the parser in a more idiomatic (for *parsec) style.
2014-09-13 18:52:17 +10:00 · 2014-09-13 18:52:17 +10:00 · 581a688cf5
commit 581a688cf5
parent 50f57a3586
1 changed files with 45 additions and 47 deletions
--- a/yesod-test/Yesod/Test/CssQuery.hs
+++ b/yesod-test/Yesod/Test/CssQuery.hs
@ -9,7 +9,10 @@ module Yesod.Test.CssQuery
 import Prelude hiding (takeWhile)
 import Data.Text (Text)
 import Data.Attoparsec.Text
-import Control.Applicative (many, (<|>), optional)
+import Control.Applicative
+import Data.Char
+
+import qualified Data.Text as T

 data SelectorGroup
  = DirectChildren [Selector]
@ -27,6 +30,13 @@ data Selector
  | ByAttrEnds Text Text
  deriving (Show, Eq)

+
+-- The official syntax specification for CSS2 can be found here:
+--      http://www.w3.org/TR/CSS2/syndata.html
+-- but that spec is tricky to fully support. Instead we do the minimal and we
+-- can extend it as needed.
+
+
 -- | Parses a query into an intermediate format which is easy to feed to HXT
 --
 -- * The top-level lists represent the top level comma separated queries.
@ -41,66 +51,54 @@ parseQuery = parseOnly cssQuery

 -- Below this line is the Parsec parser for css queries.
 cssQuery :: Parser [[SelectorGroup]]
-cssQuery = sepBy rules (char ',' >> (optional (char ' ')))
+cssQuery = sepBy rules (char ',' >> optional (char ' '))

 rules :: Parser [SelectorGroup]
 rules = many $ directChildren <|> deepChildren

 directChildren :: Parser SelectorGroup
-directChildren = do
-  _ <- char '>'
-  _ <- char ' '
-  sels <- selectors
-  _ <- optional $ char ' '
-  return $ DirectChildren sels
+directChildren = string "> " >> DirectChildren <$> parseSelectors

 deepChildren :: Parser SelectorGroup
-deepChildren = do 
-  sels <- selectors
-  _ <- optional $ char ' '
-  return $ DeepChildren sels
-  
-selectors :: Parser [Selector]
-selectors = many1 $ parseId
-  <|> parseClass
-  <|> parseTag
-  <|> parseAttr
+deepChildren = DeepChildren <$> parseSelectors
+
+parseSelectors :: Parser [Selector]
+parseSelectors = pOptionalTrailingSpace . many1 $
+    parseId <|> parseClass <|> parseTag <|> parseAttr

 parseId :: Parser Selector
-parseId = do
-  _ <- char '#'
-  x <- takeWhile $ flip notElem ",#.[ >"
-  return $ ById x
+parseId = char '#' >> ById <$> pIdent

 parseClass :: Parser Selector
-parseClass = do
-  _ <- char '.'
-  x <- takeWhile $ flip notElem ",#.[ >"
-  return $ ByClass x
+parseClass = char '.' >> ByClass <$> pIdent

 parseTag :: Parser Selector
-parseTag = do
-  x <- takeWhile1 $ flip notElem ",#.[ >"
-  return $ ByTagName x
+parseTag = ByTagName <$> pIdent

 parseAttr :: Parser Selector
-parseAttr = do
-  _ <- char '['
-  name <- takeWhile $ flip notElem ",#.=$^*]"
-  (parseAttrExists name)
-    <|> (parseAttrWith "=" ByAttrEquals name)
-    <|> (parseAttrWith "*=" ByAttrContains name)
-    <|> (parseAttrWith "^=" ByAttrStarts name)
-    <|> (parseAttrWith "$=" ByAttrEnds name)
+parseAttr = pSquare $ choice
+    [ ByAttrEquals <$> pIdent <*> (string "=" *> pAttrValue)
+    , ByAttrContains <$> pIdent <*> (string "*=" *> pAttrValue)
+    , ByAttrStarts <$> pIdent <*> (string "^=" *> pAttrValue)
+    , ByAttrEnds <$> pIdent <*> (string "$=" *> pAttrValue)
+    , ByAttrExists <$> pIdent
+    ]

-parseAttrExists :: Text -> Parser Selector
-parseAttrExists attrname = do
-  _ <- char ']'
-  return $ ByAttrExists attrname
+-- | pIdent : Parse an identifier (not yet supporting escapes and unicode as
+-- part of the identifier). Basically the regex: [-]?[_a-zA-Z][_a-zA-Z0-9]*
+pIdent :: Parser Text
+pIdent = pOptionalTrailingSpace $ do
+    leadingMinus <- string "-" <|> pure ""
+    nmstart <- T.singleton <$> satisfy (\c -> isAlpha c || c == '_')
+    nmchar <- takeWhile (\c -> isAlphaNum c || c == '_')
+    return $ T.concat [ leadingMinus, nmstart, nmchar ]

-parseAttrWith :: Text -> (Text -> Text -> Selector) -> Text -> Parser Selector
-parseAttrWith sign constructor name = do
-  _ <- string sign
-  value <- takeWhile $ flip notElem ",#.]"
-  _ <- char ']'
-  return $ constructor name value
+
+pAttrValue :: Parser Text
+pAttrValue = takeWhile (/= ']')
+
+pSquare :: Parser a -> Parser a
+pSquare p = char '[' *> p <* char ']'
+
+pOptionalTrailingSpace :: Parser a -> Parser a
+pOptionalTrailingSpace p = p <* optional (char ' ')