Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parsers based parsers #36

Merged
merged 13 commits into from
Nov 26, 2016
52 changes: 38 additions & 14 deletions bench/MainCriterion.hs
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,9 @@ main =
defaultMainWith
(defaultConfig {resamples = 100})
[ env
-- fawltyContent <- T.pack <$> readFile "data/ttl/fawlty1.ttl"
(do rdfContent <- T.pack <$> readFile "bills.099.actions.rdf"
fawltyContentTurtle <- T.pack <$> readFile "data/ttl/fawlty1.ttl"
fawltyContentNTriples <- T.pack <$> readFile "data/nt/all-fawlty-towers.nt"
let (Right rdf1) =
parseString (XmlParser Nothing Nothing) rdfContent
let (Right rdf2) =
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any reason not to just replace all uses of rdf2 with rdf1? They are guaranteed to be identical.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, they're different types! Never mind!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be good to have type signatures here instead of at their use sites.

Expand All @@ -59,21 +60,44 @@ main =
( rdf1 :: RDF TList
, rdf2 :: RDF AdjHashMap
, triples :: Triples
)) $ \ ~(triplesList, adjMap,triples) ->
, fawltyContentNTriples :: T.Text
, fawltyContentTurtle :: T.Text
)) $ \ ~(triplesList, adjMap, triples, fawltyContentNTriples, fawltyContentTurtle) ->
bgroup
"rdf4h"
-- bgroup
-- "parsers"
-- [ bench "AdjHashMap" $
-- nf (parseTtlRDF :: T.Text -> RDF AdjHashMap) fawlty_towers
-- , bench "HashSP" $
-- nf (parseTtlRDF :: T.Text -> RDF HashSP) fawlty_towers
-- , bench "SP" $ nf (parseTtlRDF :: T.Text -> RDF SP) fawlty_towers
-- , bench "TList" $
-- nf (parseTtlRDF :: T.Text -> RDF TList) fawlty_towers
-- ]
-- ,
[ bgroup
[ bgroup
"parsers"
[ bench "ntriples-parsec" $
nf (\t ->
let res = parseNTriplesStringParsec t :: Either ParseFailure (RDF TList)
in case res of
Left e -> error (show e)
Right rdfG -> rdfG
) fawltyContentNTriples
, bench "ntriples-attoparsec" $
nf (\t ->
let res = parseNTriplesStringAttoparsec t :: Either ParseFailure (RDF TList)
in case res of
Left e -> error (show e)
Right rdfG -> rdfG
) fawltyContentNTriples
, bench "turtle-parsec" $
nf (\t ->
let res = parseTurtleStringParsec Nothing Nothing t :: Either ParseFailure (RDF TList)
in case res of
Left e -> error (show e)
Right rdfG -> rdfG
) fawltyContentTurtle
, bench "turtle-attoparsec" $
nf (\t ->
let res = parseTurtleStringAttoparsec Nothing Nothing t :: Either ParseFailure (RDF TList)
in case res of
Left e -> error (show e)
Right rdfG -> rdfG
) fawltyContentTurtle
]
,
bgroup
"query"
(queryBench "TList" triplesList ++
queryBench "AdjHashMap" adjMap
Expand Down
5 changes: 4 additions & 1 deletion rdf4h.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ library
, Text.RDF.RDF4H.NTriplesParser
, Text.RDF.RDF4H.NTriplesSerializer
, Text.RDF.RDF4H.XmlParser
build-depends: base >= 4.8.0.0
build-depends: attoparsec >= 0.13.1.0
, base >= 4.8.0.0
, bytestring
, directory
, containers
Expand All @@ -55,6 +56,8 @@ library
, text-binary
, utf8-string
, hgal
, parsers
, mtl
if impl(ghc < 7.6)
build-depends: ghc-prim

Expand Down
85 changes: 32 additions & 53 deletions src/Data/RDF/Types.hs
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}

module Data.RDF.Types (

Expand Down Expand Up @@ -48,7 +46,6 @@ import qualified Data.Text as T
import System.IO
import Text.Printf
import Data.Binary
import Control.Monad (guard)
import Data.Map(Map)
import Data.Maybe (fromJust)
import GHC.Generics (Generic)
Expand All @@ -57,11 +54,14 @@ import qualified Data.List as List
import qualified Data.Map as Map
import qualified Network.URI as Network (uriPath,parseURI)
import Control.DeepSeq (NFData,rnf)
import Text.Parsec
import Text.Parsec.Text
import Text.Parsec(ParseError,parse)
import Network.URI
import Codec.Binary.UTF8.String

import Text.Parser.Char
import Text.Parser.Combinators
import Control.Applicative

-------------------
-- LValue and constructor functions

Expand Down Expand Up @@ -177,19 +177,17 @@ isRdfURI :: T.Text -> Either ParseError T.Text
isRdfURI t = parse (isRdfURIParser <* eof) ("Invalid URI: " ++ T.unpack t) t

-- [18] IRIREF from Turtle spec
isRdfURIParser :: GenParser () T.Text
isRdfURIParser = T.concat <$> many (T.singleton <$> noneOf (['\x00'..'\x20'] ++ [' ','<','>','"','{','}','|','^','`','\\']) <|> nt_uchar)
isRdfURIParser :: CharParsing m => m T.Text
isRdfURIParser = T.concat <$> many (T.singleton <$> noneOf (['\x00'..'\x20'] ++ " <>\"{}|^`\\") <|> nt_uchar)

-- [10] UCHAR
nt_uchar :: GenParser () T.Text
nt_uchar :: CharParsing m => m T.Text
nt_uchar =
(try (char '\\' >> char 'u' >> count 4 hexDigit >>= \cs -> return $ T.pack (uEscapedToXEscaped cs)) <|>
try (char '\\' >> char 'U' >> count 8 hexDigit >>= \cs -> return $ T.pack (uEscapedToXEscaped cs)))
try (T.pack . uEscapedToXEscaped <$> (string "\\u" *> count 4 hexDigit)) <|>
try (T.pack . uEscapedToXEscaped <$> (string "\\U" *> count 8 hexDigit))

uEscapedToXEscaped :: String -> String
uEscapedToXEscaped ss =
let str = ['\\','x'] ++ ss
in read ("\"" ++ str ++ "\"")
uEscapedToXEscaped ss = read ("\"\\x" ++ ss ++ "\"")

-- |Validate a Text URI and return it in a @Just Text@ if it is
-- valid, otherwise @Nothing@ is returned. See 'unodeValidate'.
Expand All @@ -207,47 +205,29 @@ uriValidateString t = case isRdfURIString of
isRdfURIString = parse (isRdfURIParserS <* eof) ("Invalid URI: " ++ t) t
isRdfURIParserS = many (validUriChar <|> nt_ucharS)
nt_ucharS =
(try (char '\\' >> char 'u' >> count 4 hexDigit >>= return . head . uEscapedToXEscaped) <|>
try (char '\\' >> char 'U' >> count 8 hexDigit >>= return . head . uEscapedToXEscaped))
try (head . uEscapedToXEscaped <$> (string "\\u" *> count 4 hexDigit)) <|>
try (head . uEscapedToXEscaped <$> (string "\\U" *> count 8 hexDigit))
-- [18] IRIREF from Turtle spec
validUriChar = try $ do
c <- anyChar
guard $ not (c >= '\x00' && c <= '\x20') && c `notElem` [' ','<','>','"','{','}','|','^','`','\\']
return c
validUriChar = try $ satisfy $ \c ->
not (c >= '\x00' && c <= '\x20')
&& c `notElem` [' ','<','>','"','{','}','|','^','`','\\']

-- | Decodes @\Uxxxxxxxx@ and @\uxxxx@ escape sequences into the characters
-- they denote, according to the RDF specification.
escapeRDFSyntax :: T.Text -> T.Text
escapeRDFSyntax t = T.pack uri
where
Right uri = parse unicodeEscParser "" (T.unpack t)
unicodeEscParser :: Stream s m Char => ParsecT s u m String
unicodeEscParser = do
ss <- many (
try (do { _ <- char '\\'
; _ <- char 'U'
; pos1 <- hexDigit
; pos2 <- hexDigit
; pos3 <- hexDigit
; pos4 <- hexDigit
; pos5 <- hexDigit
; pos6 <- hexDigit
; pos7 <- hexDigit
; pos8 <- hexDigit
; let str = ['\\','x',pos1,pos2,pos3,pos4,pos5,pos6,pos7,pos8]
; return (read ("\"" ++ str ++ "\"") :: String)})
unicodeEscParser :: (CharParsing m, Monad m) => m String
unicodeEscParser =
concat <$> many (
try (do { str <- ("\\x"++) <$> (string "\\U" *> count 8 hexDigit)
; pure (read ("\"" ++ str ++ "\"") :: String)})
<|>
try (do { _ <- char '\\'
; _ <- char 'u'
; pos1 <- hexDigit
; pos2 <- hexDigit
; pos3 <- hexDigit
; pos4 <- hexDigit
; let str = ['\\','x',pos1,pos2,pos3,pos4]
; return (read ("\"" ++ str ++ "\"") :: String)})
<|>
(anyChar >>= \c -> return [c]))
return (concat ss :: String)
try (do { str <- ("\\x"++) <$> (string "\\u" *> count 4 hexDigit)
; pure (read ("\"" ++ str ++ "\"") :: String)})
<|> (pure <$> anyChar)
)


-- |Return a blank node using the given string identifier.
Expand Down Expand Up @@ -335,7 +315,7 @@ data family RDF a
-- For more information about the concept of an RDF graph, see
-- the following: <http://www.w3.org/TR/rdf-concepts/#section-rdf-graph>.
class (Generic rdfImpl, NFData rdfImpl) => Rdf rdfImpl where

-- |Return the base URL of this RDF, if any.
baseUrl :: RDF rdfImpl -> Maybe BaseUrl

Expand Down Expand Up @@ -411,7 +391,7 @@ class (Generic rdfImpl, NFData rdfImpl) => Rdf rdfImpl where
showGraph :: RDF rdfImpl -> String

instance (Rdf a) => Show (RDF a) where
show a = showGraph a
show = showGraph

-- | An RdfParser is a parser that knows how to parse one format of RDF and
-- can parse an RDF document of that format from a string, a file, or a URL.
Expand Down Expand Up @@ -747,10 +727,9 @@ _decimalStr s = -- haskell double parser doesn't handle '1.'..,

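-- | Returns the path component of a 'UNode' URI that starts with "file://"
-- or "http://"; yields 'Nothing' for any other node or if the URI fails to parse.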
fileSchemeToFilePath :: Node -> Maybe T.Text
fileSchemeToFilePath (UNode fileScheme) =
if T.pack "file://" `T.isPrefixOf` fileScheme
then fmap (T.pack . Network.uriPath) (Network.parseURI (T.unpack fileScheme))
else if T.pack "http://" `T.isPrefixOf` fileScheme
then fmap (T.pack . Network.uriPath) (Network.parseURI (T.unpack fileScheme))
else Nothing
fileSchemeToFilePath (UNode fileScheme)
| T.pack "file://" `T.isPrefixOf` fileScheme
= fmap (T.pack . Network.uriPath) (Network.parseURI (T.unpack fileScheme))
| T.pack "http://" `T.isPrefixOf` fileScheme
= fmap (T.pack . Network.uriPath) (Network.parseURI (T.unpack fileScheme))
fileSchemeToFilePath _ = Nothing
Loading