From d78fe92cdfd4dcc276e1886c5145e8a4a50e0eef Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Mon, 8 Nov 2021 02:26:36 +0900 Subject: [PATCH] Replace XHTML mode with content types https://github.com/dahlia/seonbi/issues/18 --- CHANGES.md | 32 +++++++ app/seonbi-api.hs | 16 +++- app/seonbi.hs | 28 ++++-- demo/src/Demo.elm | 9 +- package.yaml | 1 + scripts/deno/mod.ts | 4 +- scripts/deno/test.ts | 2 +- scripts/showcase-svg/Makefile | 6 +- src/Text/Seonbi/ContentTypes.hs | 125 +++++++++++++++++++++++++++ src/Text/Seonbi/Facade.hs | 41 +++++---- test/Text/Seonbi/ContentTypesSpec.hs | 34 ++++++++ test/Text/Seonbi/FacadeSpec.hs | 2 +- 12 files changed, 262 insertions(+), 38 deletions(-) create mode 100644 src/Text/Seonbi/ContentTypes.hs create mode 100644 test/Text/Seonbi/ContentTypesSpec.hs diff --git a/CHANGES.md b/CHANGES.md index 1da9e74c..6c77b06e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -8,13 +8,42 @@ To be released. - Since this version, it requires GHC 8.8.* at least, and supports GHC 9.0.* at most. + + - Now supports several content types besides HTML/XHTML. [[#18]] + + The below Haskell APIs changed: + + - Added `Text.Seonbi.ContentTypes` module. + - Added `contentType` field for `Configuration m a`. + - Removed `xhtml` field for `Configuration m a` in favour of + new `contentType` field for the same type. + + The below CLI options changed: + + - Added `-t`/`--content-type` option with the default value `text/html`. + - Removed `-x`/`--xhtml` option in favour of new + `-t`/`--content-type` option. In order to use XHTML mode, give it + `-t application/xhtml+xml` option. + + The below HTTP APIs changed: + + - Added an optional field `"contentType"` with the default value + `"text/html"`. + - Removed `"xhtml"` field in favour of new `"contentType"` field. + In order to use XHTML mode, configure `"contentType"` field with + `"application/xhtml+xml"`. + - Added `Text.Seonbi.Html.Lang` module. 
+ - Some transformations inappropriate for non-Korean contents are no more applied to elements written in other languages than Korean. The below functions respect elements `lang` attributes: [[#10]] + - `Text.Seonbi.Hanja.phoneticizeHanja` - `Text.Seonbi.Punctuation.normalizeStops` + - Removed several functions from `Text.Seonbi.Trie` module: + - `toListBy` - `lookupBy` - `submap` @@ -25,8 +54,10 @@ To be released. - `delete` - `mapBy` - `filterMap` + - `Text.Seonbi.Trie.Trie` type is not an instance of the following typeclasses anymore: + - `Generic a => Generic (Trie a)` - `Binary a => Binary (Trie a)` - `Generic1 Trie` @@ -34,6 +65,7 @@ To be released. - `type Rep1 Trie` [#10]: https://github.com/dahlia/seonbi/issues/10 +[#18]: https://github.com/dahlia/seonbi/issues/18 Version 0.2.3 diff --git a/app/seonbi-api.hs b/app/seonbi-api.hs index dc230498..4cd13feb 100644 --- a/app/seonbi-api.hs +++ b/app/seonbi-api.hs @@ -16,6 +16,7 @@ import Data.Aeson import qualified Data.Aeson.Types import qualified Data.ByteString as B import qualified Data.Map.Strict as M +import qualified Data.Set as S import Data.Text import Data.Text.Encoding import Network.Wai @@ -50,7 +51,7 @@ instance FromJSON Input where , intercalate ", " (M.keys presets') ] Nothing -> do - xhtml' <- v .:? "xhtml" .!= False + contentType' <- v .:? "contentType" .!= "text/html" quote' <- v .:? "quote" cite' <- v .:? "cite" arrow' <- v .:? "arrow" @@ -60,7 +61,7 @@ instance FromJSON Input where hanja' <- v .:? 
"hanja" .!= Nothing return Configuration { debugLogger = Nothing - , xhtml = xhtml' + , contentType = contentType' , quote = quote' , cite = cite' , arrow = arrow' @@ -71,6 +72,17 @@ instance FromJSON Input where } return $ Input sourceHtml' config +instance FromJSON ContentType where + parseJSON = withText "ContentType" $ \ t -> + if contentTypeFromText t `S.member` contentTypes + then return (contentTypeFromText t) + else fail $ unpack $ Data.Text.concat + [ "Unknown content type: " + , t + , "; available content types: " + , intercalate ", " $ contentTypeText <$> S.elems contentTypes + ] + instance FromJSON QuoteOption instance FromJSON CiteOption instance FromJSON ArrowOption diff --git a/app/seonbi.hs b/app/seonbi.hs index 3d6dda64..0d76a1f4 100644 --- a/app/seonbi.hs +++ b/app/seonbi.hs @@ -25,6 +25,7 @@ import Codec.Text.IConv #endif import Data.ByteString.Lazy import Data.Map.Strict +import qualified Data.Set as S import qualified Data.Text as T import Data.Text.Lazy import Data.Text.Lazy.Encoding @@ -92,7 +93,7 @@ data Seonbi = Seonbi , config :: Configuration IO () , dictionaries :: [FilePath] , noKrStdict :: Bool - , xhtml :: Bool + , contentType' :: ContentType , debug :: Bool , version :: Bool , input :: FilePath @@ -137,8 +138,10 @@ enumKeyword :: (Enum a, Show a) => a -> String enumKeyword = T.unpack . enumKeyword' enumKeywords :: forall a . (Enum a, Show a) => Proxy a -> String -enumKeywords _ = T.unpack $ T.intercalate ", " $ - fmap enumKeyword' [(toEnum 0 :: a) ..] +enumKeywords _ = commas $ enumKeyword' <$> [(toEnum 0 :: a) ..] + +commas :: [T.Text] -> String +commas = T.unpack . 
T.intercalate ", " parser :: Parser Seonbi parser = Seonbi @@ -166,7 +169,7 @@ parser = Seonbi "Available presets: " ++ Data.List.intercalate ", " (Data.Map.Strict.keys presets')) ) - <|> ( Configuration Nothing False + <|> ( Configuration Nothing "text/html" <$> ( flag' Nothing ( long "no-quote" <> short 'Q' @@ -297,10 +300,14 @@ parser = Seonbi <> help ("Do not use Standard Korean Language Dictionary " ++ "(標準國語大辭典) by South Korean NIKL (國立國語院)") ) - <*> switch - ( long "xhtml" - <> short 'x' - <> help "XHTML mode" + <*> strOption + ( long "content-type" + <> short 't' + <> metavar "TYPE" + <> value "text/html" + <> help ("Content type. Available types: " ++ commas + (contentTypeText <$> S.elems contentTypes) ++ + " [default: text/html]") ) <*> switch ( long "debug" @@ -347,6 +354,7 @@ main = do , config , dictionaries , noKrStdict + , contentType' , debug , version , input @@ -371,6 +379,7 @@ main = do { debugLogger = debugLogger' , hanja = Just hanja' { reading = reading' } } + let configWithContentType = config' { contentType = contentType' } when version $ do Prelude.putStrLn $ showVersion Meta.version exitSuccess @@ -390,7 +399,8 @@ main = do enc -> enc debugPrint ("encoding: " ++ encodingName) result <- catchIOError - (transformHtmlLazyText config' $ toUnicode encodingName contents) + (transformHtmlLazyText configWithContentType $ + toUnicode encodingName contents) (\ e -> hPutStrLn stderr (ioeGetErrorString e) >> exitFailure) let resultBytes = fromUnicode encodingName result if output == "-" diff --git a/demo/src/Demo.elm b/demo/src/Demo.elm index 5772481e..148c0f5a 100644 --- a/demo/src/Demo.elm +++ b/demo/src/Demo.elm @@ -243,7 +243,14 @@ makeInput source = [ ( "preset", Json.Encode.string "ko-kp" ) ] Custom options -> - [ ( "xhtml", Json.Encode.bool options.xhtml ) + [ ( "contentType" + , Json.Encode.string <| + if options.xhtml then + "application/xhtml+xml" + + else + "text/html" + ) , ( "quote" , Json.Encode.string <| case options.quote of diff --git 
a/package.yaml b/package.yaml index 9b143c75..ea5e0078 100644 --- a/package.yaml +++ b/package.yaml @@ -69,6 +69,7 @@ library: - attoparsec >= 0.12 && < 1 - bytestring-trie >= 0.2.5 && < 0.3 - cassava >= 0.5 && < 0.6 + - case-insensitive >= 1 && < 2 - data-default >= 0.2 && < 1 - filepath >= 1 && < 2 - file-embed >= 0.0.10 && < 0.0.16 diff --git a/scripts/deno/mod.ts b/scripts/deno/mod.ts index fc420ec4..1e2ff3cf 100644 --- a/scripts/deno/mod.ts +++ b/scripts/deno/mod.ts @@ -40,8 +40,8 @@ export type Dictionary = "kr-stdict"; * See also . */ export interface Options { - /** Whether to format the result in XHTML. */ - xhtml: boolean; + /** Content type of the input and output. */ + contentType: "text/html" | "application/xhtml+xml"; /** Quoting options. */ quote: | "CurvedQuotes" diff --git a/scripts/deno/test.ts b/scripts/deno/test.ts index 68d5f682..a16c0074 100644 --- a/scripts/deno/test.ts +++ b/scripts/deno/test.ts @@ -8,7 +8,7 @@ import { import { assertEquals } from "https://deno.land/std@0.106.0/testing/asserts.ts"; const hanjaInParens: Options = { - xhtml: false, + contentType: "text/html", quote: "CurvedQuotes", cite: null, arrow: null, diff --git a/scripts/showcase-svg/Makefile b/scripts/showcase-svg/Makefile index b3de9b32..b14ec32a 100644 --- a/scripts/showcase-svg/Makefile +++ b/scripts/showcase-svg/Makefile @@ -17,21 +17,21 @@ build/showcase.svg: template.svg build/ko-kr.html build/ko-kp.html build/ko-kore build/ko-kr.html: build/input.html $(SEONBI) \ --preset ko-kr \ - --xhtml \ + --content-type application/xhtml+xml \ --output build/ko-kr.html \ build/input.html build/ko-kp.html: build/input.html $(SEONBI) \ --preset ko-kp \ - --xhtml \ + --content-type application/xhtml+xml \ --output build/ko-kp.html \ build/input.html build/ko-kore.html: build/input.html $(SEONBI) \ --render-hanja hanja-in-ruby \ - --xhtml \ + --content-type application/xhtml+xml \ --output build/ko-kore.html \ build/input.html diff --git a/src/Text/Seonbi/ContentTypes.hs b/src/Text/Seonbi/ContentTypes.hs new 
file mode 100644 index 00000000..e684a170 --- /dev/null +++ b/src/Text/Seonbi/ContentTypes.hs @@ -0,0 +1,125 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE RankNTypes #-} +module Text.Seonbi.ContentTypes + ( ContentType + , HtmlTransformer + , TextTransformer + , asHtmlTransformer + , asHtmlTransformer' + , asXhtmlTransformer + , contentTypeFromText + , contentTypes + , contentTypeText + , transformWithContentType + ) where + +#if MIN_VERSION_base(4,13,0) +import Prelude hiding (MonadFail) +#endif + +import Control.Monad.Fail (MonadFail) +import Data.List + +import Data.CaseInsensitive +import Data.Set +import Data.Text as ST +import Data.Text.Lazy as LT + +import Text.Seonbi.Html + +-- | Represents a function that transforms an 'HtmlEntity' list. +type HtmlTransformer m + = (Monad m, MonadFail m) => [HtmlEntity] -> m [HtmlEntity] + +-- | Represents a function that transforms a text. +type TextTransformer m + = (Monad m, MonadFail m) => LT.Text -> m LT.Text + +-- | Represents a function that transforms an 'HtmlTransformer' into +-- a 'TextTransformer'. +type TransformerTransformer m + = (Monad m, MonadFail m) => HtmlTransformer m -> TextTransformer m + +-- | Gets a 'TransformerTransformer' that transforms 'HtmlTransformer' into +-- a 'TextTransformer' which transforms an HTML/XHTML text. +asHtmlTransformer' + :: (Monad m, MonadFail m) + => Bool + -- ^ 'True' for XHTML, and 'False' for HTML. + -> TransformerTransformer m + -- ^ A 'TransformerTransformer' that transforms an 'HtmlTransformer' into + -- a 'TextTransformer' which transforms an HTML/XHTML text. 
+asHtmlTransformer' xhtml transformer htmlText = do + case scanHtml htmlText of + Done "" input -> do + output <- transformer input + return $ printHtml' output + _ -> + fail "failed to parse input" + where + printHtml' :: [HtmlEntity] -> LT.Text + printHtml' + | xhtml = printXhtml + | otherwise = printHtml + +-- | Transforms an 'HtmlTransformer' into a 'TextTransformer' which transforms +-- an HTML text. +asHtmlTransformer :: (Monad m, MonadFail m) => TransformerTransformer m +asHtmlTransformer = asHtmlTransformer' False + +-- | Transforms an 'HtmlTransformer' into a 'TextTransformer' which transforms +-- an XHTML text. +asXhtmlTransformer :: (Monad m, MonadFail m) => TransformerTransformer m +asXhtmlTransformer = asHtmlTransformer' True + +-- | Represents a case-insensitive content type. +type ContentType = CI ST.Text + +-- | Converts a 'Text' to a 'ContentType'. +contentTypeFromText :: ST.Text -> ContentType +contentTypeFromText = mk + +-- | Converts a 'ContentType' to a 'Text'. +contentTypeText :: ContentType -> ST.Text +contentTypeText = original + +newtype TransformerTransformer' m = + TransformerTransformer' (TransformerTransformer m) +transformers :: (Monad m, MonadFail m) + => [(ContentType, TransformerTransformer' m)] +transformers = + [ ("text/html", TransformerTransformer' asHtmlTransformer) + , ("application/xhtml+xml", TransformerTransformer' asXhtmlTransformer) + ] + +-- | Supported content types. +contentTypes :: Set ContentType +contentTypes = (Data.Set.fromList . Prelude.map fst) + (transformers :: [(ContentType, TransformerTransformer' IO)]) + +getTransformerTransformer :: (Monad m, MonadFail m) + => ContentType + -> Maybe (TransformerTransformer' m) +getTransformerTransformer contentType = + snd <$> Data.List.find ((== contentType) . fst) transformers + +-- | Applies an 'HtmlTransformer' to the given text with respect to the +-- given content type. +transformWithContentType + :: (Monad m, MonadFail m) + => ContentType + -- ^ A content type. 
If the content type is unsupported (i.e. not in + -- 'contentTypes'), this function fails. + -> HtmlTransformer m + -- ^ An 'HtmlTransformer' to apply. + -> LT.Text + -- ^ An input text to transform. + -> m LT.Text + -- ^ A transformed text. +transformWithContentType contentType transformer inputText = + case getTransformerTransformer contentType of + Nothing -> fail $ ST.unpack $ + "unknown content type: " <> contentTypeText contentType + Just (TransformerTransformer' transformTransformer) -> + transformTransformer transformer inputText diff --git a/src/Text/Seonbi/Facade.hs b/src/Text/Seonbi/Facade.hs index 920df7d9..f7bbe830 100644 --- a/src/Text/Seonbi/Facade.hs +++ b/src/Text/Seonbi/Facade.hs @@ -19,6 +19,11 @@ module Text.Seonbi.Facade , ko_KP , ko_KR , presets + -- * Content types + , ContentType + , contentTypeFromText + , contentTypes + , contentTypeText -- * Dictionaries , HanjaDictionary , readDictionaryFile @@ -66,6 +71,7 @@ import System.FilePath #ifndef EMBED_DICTIONARY import Paths_seonbi (getDataDir) #endif +import Text.Seonbi.ContentTypes import Text.Seonbi.Hanja import Text.Seonbi.Html import Text.Seonbi.Punctuation @@ -79,8 +85,9 @@ import Text.Seonbi.Trie as Trie data Monad m => Configuration m a = Configuration { -- | An optional debugging logger to print its internal AST. debugLogger :: Maybe (HtmlEntity -> m a) - -- | Whether to take and result in XHTML instead of HTML. - , xhtml :: Bool + -- | A content type of the input and output. It has to be a member of + -- 'contentTypes'. + , contentType :: ContentType -- | An option to decide how quotation marks are rendered. -- If 'Nothing' no quotes are transformed. 
, quote :: Maybe QuoteOption @@ -105,7 +112,7 @@ instance Monad m => Show (Configuration m a) where show c = "Configuration {\n" <> " debugLogger = " <> maybe "Nothing" (const "Just ...") (debugLogger c) <> "," <> - " xhtml = " <> show (xhtml c) <> "," <> + " contentType = " <> show (contentType c) <> "," <> " quote = " <> show (quote c) <> "," <> " arrow = " <> show (cite c) <> "," <> " cite = " <> show (arrow c) <> "," <> @@ -235,8 +242,8 @@ instance Show HanjaReadingOption where show initialSoundLaw <> " }" --- | Transforms a given HTML text. 'Nothing' if it fails to parse the given --- HTML text. +-- | Transforms a given text. It fails in the monad @m@ (e.g., 'Nothing' for +-- 'Maybe') if it fails to parse the given text. transformHtmlText :: forall (m :: Type -> Type) a. (Monad m, MonadFail m) => Configuration m a -> Text -> m Text transformHtmlText config = @@ -245,20 +252,16 @@ transformHtmlText config = -- | A lazy version of 'transformHtmlText' function. transformHtmlLazyText :: forall (m :: Type -> Type) a. (Monad m, MonadFail m) => Configuration m a -> LT.Text -> m LT.Text -transformHtmlLazyText config@Configuration { xhtml, debugLogger } htmlText = - case scanHtml htmlText of - Done "" input -> do - case debugLogger of - Just logger -> mapM_ logger input - Nothing -> return () - return $ printHtml' $ toTransformer config input - _ -> - fail "failed to parse input" +transformHtmlLazyText config@Configuration { contentType, debugLogger } = + transformWithContentType contentType transformerM where - printHtml' :: [HtmlEntity] -> LT.Text - printHtml' - | xhtml = printXhtml - | otherwise = printHtml + transformer :: [HtmlEntity] -> [HtmlEntity] + transformer = toTransformer config + transformerM = case debugLogger of + Nothing -> return <$> transformer + Just logger -> \ input -> do + mapM_ logger input + return $ transformer input toTransformers :: Monad m => Configuration m a -> [[HtmlEntity] -> [HtmlEntity]] toTransformers Configuration { quote @@ -355,7 +358,7 @@ ko_KR = Configuration , 
initialSoundLaw = True } } - , xhtml = False + , contentType = "text/html" } -- | Preset 'Configuration' for North Korean orthography. diff --git a/test/Text/Seonbi/ContentTypesSpec.hs b/test/Text/Seonbi/ContentTypesSpec.hs new file mode 100644 index 00000000..f42a6841 --- /dev/null +++ b/test/Text/Seonbi/ContentTypesSpec.hs @@ -0,0 +1,34 @@ +{-# LANGUAGE OverloadedStrings #-} +module Text.Seonbi.ContentTypesSpec (spec) where + +import Data.Text + +import Test.Hspec + +import Text.Seonbi.Html +import Text.Seonbi.ContentTypes + +textReverser :: (Monad m, MonadFail m) => HtmlTransformer m +textReverser entities = + return $ reverseText <$> entities + where + reverseText :: HtmlEntity -> HtmlEntity + reverseText e@HtmlText { rawText = t } = e { rawText = Data.Text.reverse t } + reverseText e@HtmlCdata { text = t } = e { text = Data.Text.reverse t } + reverseText e = e + +spec :: Spec +spec = do + specify "asHtmlTransformer" $ do + r <- asHtmlTransformer textReverser "

foo bar
baz

" + r `shouldBe` "

oofrab
zab

" + specify "asXhtmlTransformer" $ do + r <- asXhtmlTransformer textReverser "

foo bar
baz

" + r `shouldBe` "

oofrab
zab

" + specify "transformWithContentType" $ do + let input = "

foo bar

" + h <- transformWithContentType "text/html" textReverser input + h `shouldBe` "

oofrab

" + x <- transformWithContentType "application/xhtml+xml" textReverser input + x `shouldBe` "

oofrab

" + diff --git a/test/Text/Seonbi/FacadeSpec.hs b/test/Text/Seonbi/FacadeSpec.hs index eff8c638..5b572417 100644 --- a/test/Text/Seonbi/FacadeSpec.hs +++ b/test/Text/Seonbi/FacadeSpec.hs @@ -79,6 +79,6 @@ spec = do , emDash = False , stop = Nothing , hanja = Nothing - , xhtml = False + , contentType = "text/html" , debugLogger = Nothing }