src/Text/Toml/Parser.hs

{-# LANGUAGE FlexibleContexts  #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes        #-}
{-# LANGUAGE CPP               #-}

module Text.Toml.Parser
  ( module Text.Toml.Parser
  , module Text.Toml.Types
  ) where

import           Control.Applicative hiding (many, optional, (<|>))
import           Control.Monad

import qualified Data.HashMap.Strict as M
import qualified Data.List           as L
import qualified Data.Set            as S
import           Data.Text           (Text, pack, unpack)
import qualified Data.Vector         as V

#if MIN_VERSION_time(1,5,0)
import           Data.Time.Format    (defaultTimeLocale, iso8601DateFormat,
                                      parseTimeM)
#else
import           Data.Time.Format    (parseTime)
import           System.Locale       (defaultTimeLocale, iso8601DateFormat)
#endif

import           Numeric             (readHex)
import           Text.Parsec

import           Text.Toml.Types

-- Imported as last to fix redundancy warning
import           Prelude             hiding (concat, takeWhile)


-- | Our very own Parser type.
type Parser a = forall s. Parsec Text s a


-- | Convenience function for the test suite and GHCI.
parseOnly :: Parsec Text (S.Set [Text]) a -> Text -> Either ParseError a
parseOnly p str = runParser (p <* eof)  S.empty "test" str


-- | Parses a complete document formatted according to the TOML spec.
tomlDoc :: Parsec Text (S.Set [Text]) Table
tomlDoc = do
    skipBlanks
    topTable <- table
    namedSections <- many namedSection
    -- Ensure the input is completely consumed
    eof
    -- Load each named section into the top table
    foldM (flip (insert Explicit)) topTable namedSections

-- | Parses a table of key-value pairs.
table :: Parser Table
table = do
    pairs <- try (many (assignment <* skipBlanks)) <|> (try skipBlanks >> return [])
    case maybeDupe (map fst pairs) of
      Just k  -> fail $ "Cannot redefine key " ++ (unpack k)
      Nothing -> return $ M.fromList pairs

-- | Parses an inline table of key-value pairs.
inlineTable :: Parser Node
inlineTable = do
    pairs <- between (char '{') (char '}') (skipSpaces *> separatedValues <* skipSpaces)
    case maybeDupe (map fst pairs) of
      Just k  -> fail $ "Cannot redefine key " ++ (unpack k)
      Nothing -> return $ VTable $ M.fromList pairs
  where
    skipSpaces      = many (satisfy isSpc)
    separatedValues = sepBy (skipSpaces *> assignment <* skipSpaces) comma
    comma           = skipSpaces >> char ',' >> skipSpaces

-- | Find dupes, if any.
maybeDupe :: Ord a => [a] -> Maybe a
maybeDupe xx = dup xx S.empty
  where
    dup []     _ = Nothing
    dup (x:xs) s = if S.member x s then Just x else dup xs (S.insert x s)


-- | Parses a 'Table' or 'TableArray' with its header.
-- The resulting tuple has the header's value in the first position, and the
-- 'NTable' or 'NTArray' in the second.
namedSection :: Parser ([Text], Node)
namedSection = do
    eitherHdr <- try (Left <$> tableHeader) <|> try (Right <$> tableArrayHeader)
    skipBlanks
    tbl <- table
    skipBlanks
    return $ case eitherHdr of Left  ns -> (ns, VTable tbl )
                               Right ns -> (ns, VTArray $ V.singleton tbl)


-- | Parses a table header.
tableHeader :: Parser [Text]
tableHeader = between (char '[') (char ']') headerValue


-- | Parses a table array header.
tableArrayHeader :: Parser [Text]
tableArrayHeader = between (twoChar '[') (twoChar ']') headerValue
  where
    twoChar c = count 2 (char c)


-- | Parses the value of any header (names separated by dots), into a list of 'Text'.
headerValue :: Parser [Text]
headerValue = ((pack <$> many1 keyChar) <|> anyStr') `sepBy1` (char '.')
  where
    keyChar = alphaNum <|> char '_' <|> char '-'

-- | Parses a key-value assignment.
assignment :: Parser (Text, Node)
assignment = do
    k <- (pack <$> many1 keyChar) <|> anyStr'
    many (satisfy isSpc) >> char '=' >> skipBlanks
    v <- value
    return (k, v)
  where
    -- TODO: Follow the spec, e.g.: only first char cannot be '['.
    keyChar = alphaNum <|> char '_' <|> char '-'


-- | Parses a value.
value :: Parser Node
value = (try array       <?> "array")
    <|> (try boolean     <?> "boolean")
    <|> (try anyStr      <?> "string")
    <|> (try datetime    <?> "datetime")
    <|> (try float       <?> "float")
    <|> (try integer     <?> "integer")
    <|> (try inlineTable <?> "inline table")


--
-- | * Toml value parsers
--

array :: Parser Node
array = (try (arrayOf array)    <?> "array of arrays")
    <|> (try (arrayOf boolean)  <?> "array of booleans")
    <|> (try (arrayOf anyStr)   <?> "array of strings")
    <|> (try (arrayOf datetime) <?> "array of datetimes")
    <|> (try (arrayOf float)    <?> "array of floats")
    <|> (try (arrayOf integer)  <?> "array of integers")


boolean :: Parser Node
boolean = VBoolean <$> ( (try . string $ "true")  *> return True  <|>
                         (try . string $ "false") *> return False )


anyStr :: Parser Node
anyStr = VString <$> anyStr'

anyStr' :: Parser Text
anyStr' = try multiBasicStr <|> try basicStr <|> try multiLiteralStr <|> try literalStr


basicStr :: Parser Text
basicStr = between dQuote dQuote (fmap pack $ many strChar)
  where
    strChar = try escSeq <|> try (satisfy (\c -> c /= '"' && c /= '\\'))
    dQuote  = char '\"'


multiBasicStr :: Parser Text
multiBasicStr = (openDQuote3 *> escWhiteSpc *> (pack <$> manyTill strChar (try dQuote3)))
  where
    -- | Parse the a tripple-double quote, with possibly a newline attached
    openDQuote3 = try (dQuote3 <* char '\n') <|> try dQuote3
    -- | Parse tripple-double quotes
    dQuote3     = count 3 $ char '"'
    -- | Parse a string char, accepting escaped codes, ignoring escaped white space
    strChar     = (escSeq <|> (satisfy (/= '\\'))) <* escWhiteSpc
    -- | Parse escaped white space, if any
    escWhiteSpc = many $ char '\\' >> char '\n' >> (many $ satisfy (\c -> isSpc c || c == '\n'))


literalStr :: Parser Text
literalStr = between sQuote sQuote (pack <$> many (satisfy (/= '\'')))
  where
    sQuote = char '\''


multiLiteralStr :: Parser Text
multiLiteralStr = (openSQuote3 *> (fmap pack $ manyTill anyChar sQuote3))
  where
    -- | Parse the a tripple-single quote, with possibly a newline attached
    openSQuote3 = try (sQuote3 <* char '\n') <|> try sQuote3
    -- | Parse tripple-single quotes
    sQuote3     = try . count 3 . char $ '\''


datetime :: Parser Node
datetime = do
    d <- try $ manyTill anyChar (char 'Z')
#if MIN_VERSION_time(1,5,0)
    let  mt = parseTimeM True defaultTimeLocale (iso8601DateFormat $ Just "%X") d
#else
    let  mt = parseTime defaultTimeLocale (iso8601DateFormat $ Just "%X") d
#endif
    case mt of Just t  -> return $ VDatetime t
               Nothing -> fail "parsing datetime failed"


-- | Attoparsec 'double' parses scientific "e" notation; reimplement according to Toml spec.
float :: Parser Node
float = VFloat <$> do
    n <- intStr <* lookAhead (satisfy (\c -> c == '.' || c == 'e' || c == 'E'))
    d <- try (satisfy (== '.') *> uintStr) <|> return "0"
    e <- try (satisfy (\c -> c == 'e' || c == 'E') *> intStr) <|> return "0"
    return . read . L.concat $ [n, ".", d, "e", e]
  where
    sign    = try (string "-") <|> (try (char '+') >> return "") <|> return ""
    uintStr = (:) <$> digit <*> many (optional (char '_') *> digit)
    intStr  = do s <- sign
                 u <- uintStr
                 return . L.concat $ [s, u]


integer :: Parser Node
integer = VInteger <$> (signed $ read <$> uintStr)
  where
    uintStr :: Parser [Char]
    uintStr = (:) <$> digit <*> many (optional (char '_') *> digit)

--
-- * Utility functions
--

-- | Parses the elements of an array, while restricting them to a certain type.
arrayOf :: Parser Node -> Parser Node
arrayOf p = (VArray . V.fromList) <$>
                between (char '[') (char ']') (skipBlanks *> separatedValues)
  where
    separatedValues = sepEndBy (skipBlanks *> try p <* skipBlanks) comma <* skipBlanks
    comma           = skipBlanks >> char ',' >> skipBlanks


-- | Parser for escape sequences.
escSeq :: Parser Char
escSeq = char '\\' *> escSeqChar
  where
    escSeqChar =  try (char '"')  *> return '"'
              <|> try (char '\\') *> return '\\'
              <|> try (char '/')  *> return '/'
              <|> try (char 'b')  *> return '\b'
              <|> try (char 't')  *> return '\t'
              <|> try (char 'n')  *> return '\n'
              <|> try (char 'f')  *> return '\f'
              <|> try (char 'r')  *> return '\r'
              <|> try (char 'u')  *> unicodeHex 4
              <|> try (char 'U')  *> unicodeHex 8
              <?> "escape character"


-- | Parser for unicode hexadecimal values of representation length 'n'.
unicodeHex :: Int -> Parser Char
unicodeHex n = do
    h <- count n (satisfy isHex)
    let v = fst . head . readHex $ h
    return $ if v <= maxChar then toEnum v else '_'
  where
    isHex c = (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
    maxChar = fromEnum (maxBound :: Char)


-- | Parser for signs (a plus or a minus).
signed :: Num a => Parser a -> Parser a
signed p =  try (negate <$> (char '-' *> p))
        <|> try (char '+' *> p)
        <|> try p


-- | Parses the (rest of the) line including an EOF, whitespace and comments.
skipBlanks :: Parser ()
skipBlanks = skipMany blank
  where
    blank   = try ((many1 $ satisfy isSpc) >> return ()) <|> try comment <|> try eol
    comment = char '#' >> (many $ satisfy (/= '\n')) >> return ()


-- | Results in 'True' for whitespace chars, tab or space, according to spec.
isSpc :: Char -> Bool
isSpc c = c == ' ' || c == '\t'


-- | Parse an EOL, as per TOML spec this is 0x0A a.k.a. '\n' or 0x0D a.k.a. '\r'.
eol :: Parser ()
eol = (string "\n" <|> string "\r\n") >> return ()