Make readFields warn about inconsistent indentation.

This is affect of using indentOfAtLeast method: any indentation greater than current offset is fine. That behavior is desirable to parsing multiline field contents, but it is a bit surprising for fields, which we expect to be aligned. Such insonsistency seems to be always a mistake, and it's easy to fix once a machine points it out.
haskell · May 28, 2023 · 147ed3e · 147ed3e
1 parent 9d8c248
commit 147ed3e
Show file tree

Hide file tree

Showing 9 changed files with 168 additions and 20 deletions.
diff --git a/Cabal-syntax/src/Distribution/Fields/Lexer.hs b/Cabal-syntax/src/Distribution/Fields/Lexer.hs
@@ -172,7 +172,7 @@ alex_actions = array (0 :: Int, 31)
   , (0,alex_action_9)
   ]
 
-{-# LINE 151 "templates/Lexer.x" #-}
+{-# LINE 163 "templates/Lexer.x" #-}
 -- | Tokens of outer cabal file structure. Field values are treated opaquely.
 data Token = TokSym   !ByteString       -- ^ Haskell-like identifier, number or operator
            | TokStr   !ByteString       -- ^ String in quotes
@@ -204,6 +204,9 @@ checkLeadingWhitespace pos len bs
 
 checkWhitespace :: Position -> Int -> ByteString -> Lex Int
 checkWhitespace pos len bs
+    -- UTF8 NBSP is 194 160. This function is called on whitespace bytestrings,
+    -- therefore counting 194 bytes is enough to count non-breaking spaces.
+    -- We subtract the amount of 194 bytes to convert bytes length into char length
     | B.any (== 194) (B.take len bs) = do
         addWarningAt pos LexWarningNBSP
         return $ len - B.count 194 (B.take len bs)
@@ -315,14 +318,21 @@ in_field_layout = 5
 in_section = 6
 alex_action_0 = \pos len _ -> do
               when (len /= 0) $ addWarningAt pos LexWarningBOM
+              setPos pos -- reset position as if BOM didn't exist
               setStartCode bol_section
               lexToken
 alex_action_1 = \pos len inp -> checkWhitespace pos len inp >> adjustPos retPos >> lexToken
-alex_action_3 = \pos len inp -> checkLeadingWhitespace pos len inp >>
+alex_action_3 = \pos len inp -> checkLeadingWhitespace pos len inp >>= \len' ->
+                                     -- len' is character whitespace length (counting nbsp as one)
                                      if B.length inp == len
                                        then return (L pos EOF)
-                                       else setStartCode in_section
-                                         >> return (L pos (Indent len))
+                                       else do
+                                        -- Small hack: if char and byte length mismatch
+                                        -- subtract the difference, so lexToken will count position correctly.
+                                        -- Proper (and slower) fix is to count utf8 length in lexToken
+                                        when (len' /= len) $ adjustPos (incPos (len' - len))
+                                        setStartCode in_section
+                                        return (L pos (Indent len'))
 alex_action_4 = tok  OpenBrace
 alex_action_5 = tok  CloseBrace
 alex_action_8 = toki TokSym
@@ -336,8 +346,13 @@ alex_action_15 = \_ _ _ -> adjustPos retPos >> setStartCode bol_section >> lexTo
 alex_action_16 = \pos len inp -> checkLeadingWhitespace pos len inp >>= \len' ->
                                   if B.length inp == len
                                     then return (L pos EOF)
-                                    else setStartCode in_field_layout
-                                      >> return (L pos (Indent len'))
+                                    else do
+                                      -- Small hack: if char and byte length mismatch
+                                      -- subtract the difference, so lexToken will count position correctly.
+                                      -- Proper (and slower) fix is to count utf8 length in lexToken
+                                      when (len' /= len) $ adjustPos (incPos (len' - len))
+                                      setStartCode in_field_layout
+                                      return (L pos (Indent len'))
 alex_action_18 = toki TokFieldLine
 alex_action_19 = \_ _ _ -> adjustPos retPos >> setStartCode bol_field_layout >> lexToken
 alex_action_20 = \_ _ _ -> setStartCode in_field_braces >> lexToken

diff --git a/Cabal-syntax/src/Distribution/Fields/LexerMonad.hs b/Cabal-syntax/src/Distribution/Fields/LexerMonad.hs
@@ -35,7 +35,7 @@ module Distribution.Fields.LexerMonad (
 import qualified Data.ByteString              as B
 import qualified Data.List.NonEmpty           as NE
 import           Distribution.Compat.Prelude
-import           Distribution.Parsec.Position (Position (..), showPos)
+import           Distribution.Parsec.Position (Position (..), showPos, positionRow)
 import           Distribution.Parsec.Warning  (PWarnType (..), PWarning (..))
 import           Prelude ()
 
@@ -65,9 +65,10 @@ instance Monad Lex where
 data LexResult a = LexResult {-# UNPACK #-} !LexState a
 
 data LexWarningType
-    = LexWarningNBSP  -- ^ Encountered non breaking space
-    | LexWarningBOM   -- ^ BOM at the start of the cabal file
-    | LexWarningTab   -- ^ Leading tags
+    = LexWarningNBSP             -- ^ Encountered non breaking space
+    | LexWarningBOM              -- ^ BOM at the start of the cabal file
+    | LexWarningTab              -- ^ Leading tags
+    | LexInconsistentIndentation -- ^ indentation decreases
   deriving (Eq, Ord, Show)
 
 data LexWarning = LexWarning                !LexWarningType
@@ -78,7 +79,7 @@ toPWarnings :: [LexWarning] -> [PWarning]
 toPWarnings
     = map (uncurry toWarning)
     . Map.toList
-    . Map.fromListWith (<>)
+    . Map.fromListWith (flip (<>)) -- fromListWith gives existing element first.
     . map (\(LexWarning t p) -> (t, pure p))
   where
     toWarning LexWarningBOM poss =
@@ -87,6 +88,8 @@ toPWarnings
         PWarning PWTLexNBSP (NE.head poss) $ "Non breaking spaces at " ++ intercalate ", " (NE.toList $ fmap showPos poss)
     toWarning LexWarningTab poss =
         PWarning PWTLexTab (NE.head poss) $ "Tabs used as indentation at " ++ intercalate ", " (NE.toList $ fmap showPos poss)
+    toWarning LexInconsistentIndentation poss =
+        PWarning PWTInconsistentIndentation (NE.head poss) $ "Inconsistent indentation. Indentation jumps at lines " ++ intercalate ", " (NE.toList $ fmap (show . positionRow) poss)
 
 data LexState = LexState {
         curPos   :: {-# UNPACK #-} !Position,        -- ^ position at current input location

diff --git a/Cabal-syntax/src/Distribution/Fields/Parser.hs b/Cabal-syntax/src/Distribution/Fields/Parser.hs
@@ -33,8 +33,8 @@ import           Distribution.Compat.Prelude
 import           Distribution.Fields.Field
 import           Distribution.Fields.Lexer
 import           Distribution.Fields.LexerMonad
-                 (LexResult (..), LexState (..), LexWarning (..), unLex)
-import           Distribution.Parsec.Position   (Position (..))
+                 (LexResult (..), LexState (..), LexWarning (..), LexWarningType (..), unLex)
+import           Distribution.Parsec.Position   (Position (..), positionCol)
 import           Prelude ()
 import           Text.Parsec.Combinator         hiding (eof, notFollowedBy)
 import           Text.Parsec.Error
@@ -318,11 +318,37 @@ readFields' s = do
   where
     parser = do
         fields <- cabalStyleFile
-        ws     <- getLexerWarnings
-        pure (fields, ws)
+        ws     <- getLexerWarnings -- lexer accumulates warnings in reverse (consing them to the list)
+        pure (fields, reverse ws ++ checkIndentation fields [])
 
     lexSt = mkLexState' (mkLexState s)
 
+-- | Check (recursively) that all fields inside a block are indented the same.
+--
+-- We have to do this as a post-processing check.
+-- As the parser uses indentOfAtLeast approach, we don't know what is the "correct"
+-- indentation for following fields.
+--
+-- To catch during parsing we would need to parse first field/section of a section
+-- and then parse the following ones (softly) requiring the exactly the same indentation.
+--
+checkIndentation :: [Field Position] -> [LexWarning] -> [LexWarning]
+checkIndentation []                        = id
+checkIndentation (Field   name _    : fs') =                       checkIndentation' (nameAnn name) fs'
+checkIndentation (Section name _ fs : fs') = checkIndentation fs . checkIndentation' (nameAnn name) fs'
+
+-- | We compare adjacent fields to reduce the amount of reported indentation warnings.
+checkIndentation' :: Position -> [Field Position] -> [LexWarning] -> [LexWarning]
+checkIndentation' _   []                        = id
+checkIndentation' pos (Field   name _    : fs') = checkIndentation'' pos (nameAnn name) .                       checkIndentation' (nameAnn name) fs'
+checkIndentation' pos (Section name _ fs : fs') = checkIndentation'' pos (nameAnn name) . checkIndentation fs . checkIndentation' (nameAnn name) fs'
+
+-- | Check that positions' columns are the same.
+checkIndentation'' :: Position -> Position -> [LexWarning] -> [LexWarning]
+checkIndentation'' a b
+    | positionCol a == positionCol b = id
+    | otherwise                      = (LexWarning LexInconsistentIndentation b :)
+
 #ifdef CABAL_PARSEC_DEBUG
 parseTest' :: Show a => Parsec LexState' () a -> SourceName -> B8.ByteString -> IO ()
 parseTest' p fname s =

diff --git a/Cabal-syntax/src/Distribution/Parsec/Warning.hs b/Cabal-syntax/src/Distribution/Parsec/Warning.hs
@@ -43,6 +43,8 @@ data PWarnType
 
     | PWTEmptyFilePath         -- ^ Empty filepath, i.e. literally ""
 
+    | PWTInconsistentIndentation -- ^ sections contents (sections and fields) are indented inconsistently
+
     | PWTExperimental          -- ^ Experimental feature
     deriving (Eq, Ord, Show, Enum, Bounded, Generic)
 

diff --git a/Cabal-tests/tests/CheckTests.hs b/Cabal-tests/tests/CheckTests.hs
@@ -57,6 +57,7 @@ checkTests = testGroup "regressions"
     , checkTest "issue-7776-b.cabal"
     , checkTest "issue-7776-c.cabal"
     , checkTest "issue-8646.cabal"
+    , checkTest "decreasing-indentation.cabal"
     ]
 
 checkTest :: FilePath -> TestTree

diff --git a/Cabal-tests/tests/ParserTests/regressions/decreasing-indentation.cabal b/Cabal-tests/tests/ParserTests/regressions/decreasing-indentation.cabal
@@ -0,0 +1,72 @@
+name:       RSA
+category:   Cryptography, Codec
+version:    1.0.0
+license:    BSD3
+license-file: LICENSE
+author:     Adam Wick <awick@galois.com>
+maintainer: Adam Wick <awick@galois.com>
+stability:  stable
+build-type: Simple
+cabal-version: >= 1.2
+tested-with: GHC ==6.8.0
+synopsis: Implementation of RSA, using the padding schemes of PKCS#1 v2.1.
+description: This library implements the RSA encryption and signature
+             algorithms for arbitrarily-sized ByteStrings. While the
+             implementations work, they are not necessarily the fastest ones
+             on the planet. Particularly key generation. The algorithms
+             included are based of RFC 3447, or the Public-Key Cryptography
+             Standard for RSA, version 2.1 (a.k.a, PKCS#1 v2.1).
+
+Flag IncludeMD5
+  Description: Include support for using MD5 in the various crypto routines.
+
+Flag UseBinary
+  Description: Use the binary package for serializing keys.
+
+Library
+ build-depends: base >= 3
+ if flag(UseBinary)
+   build-depends: binary <10
+   CPP-Options: -DUSE_BINARY
+ if flag(IncludeMD5) && flag(UseBinary)
+   build-depends: pureMD5 <10
+   CPP-Options: -DINCLUDE_MD5
+ exposed-modules: Codec.Crypto.RSA
+
+Executable test_rsa
+    build-depends: base >= 3
+  CPP-Options: -DRSA_TEST
+  Main-Is: Test.hs
+  Other-Modules: Codec.Crypto.RSA
+
+-- The above is actual RSA-1.0.0 cabal file (slightly modified to produce less check warnings)
+
+-- The following sections is further inconsistent indentation examples.
+
+-- Note that here main-is is part of GHC-Options field. (and thus warned about as missing)
+Executable warnings
+    build-depends: base < 5
+  GHC-Options: -Wall
+    main-is: warnings.hs
+  Other-Modules: FooBar
+
+-- Increasing indentation is also possible if we use braces to delimit field contents.
+Executable warnings2
+  build-depends: { base <5 }
+    main-is: { warnings2.hs }
+      Other-Modules: FooBar
+
+-- another common mistake is something like below,
+-- where a sub-section is over-indented
+flag splitBase
+
+Executable warnings3
+    if flag(splitBase)
+        build-depends: base >= 3
+    else
+        build-depends: base <  3
+
+ Main-Is: warnings3.hs
+ Other-Modules:
+  Graphics.UI.WXCore
+  Graphics.UI.WXCore.Wx
diff --git a/Cabal-tests/tests/ParserTests/regressions/decreasing-indentation.check b/Cabal-tests/tests/ParserTests/regressions/decreasing-indentation.check
@@ -0,0 +1,2 @@
+decreasing-indentation.cabal:38:3: Inconsistent indentation. Indentation jumps at lines 38, 49, 56, 57, 69
+No 'main-is' field found for executable warnings
diff --git a/changelog.d/inconsistent-indentation b/changelog.d/inconsistent-indentation
@@ -0,0 +1,12 @@
+synopsis: Warn about inconsistent indentation
+packages: Cabal-syntax
+prs: #8975
+
+description:
+    Make readFields warn about inconsistent indentation.
+
+    This is an effect of using `indentOfAtLeast` method/approach: any indentation greater than current offset is accepted.
+
+    That behavior is desirable to parsing multiline field contents, but it is a bit surprising for fields in sections, which we expect to be aligned.
+
+    Such insonsistency seems to be always a mistake, and it's easy to fix once a machine points it out.
diff --git a/templates/Lexer.x b/templates/Lexer.x
@@ -85,6 +85,7 @@ tokens :-
 <0> {
   @bom?  { \pos len _ -> do
               when (len /= 0) $ addWarningAt pos LexWarningBOM
+              setPos pos -- reset position as if BOM didn't exist
               setStartCode bol_section
               lexToken
          }
@@ -98,11 +99,17 @@ tokens :-
 }
 
 <bol_section> {
-  @nbspspacetab*   { \pos len inp -> checkLeadingWhitespace pos len inp >>
+  @nbspspacetab*   { \pos len inp -> checkLeadingWhitespace pos len inp >>= \len' ->
+                                     -- len' is character whitespace length (counting nbsp as one)
                                      if B.length inp == len
                                        then return (L pos EOF)
-                                       else setStartCode in_section
-                                         >> return (L pos (Indent len)) }
+                                       else do
+                                        -- Small hack: if char and byte length mismatch
+                                        -- subtract the difference, so lexToken will count position correctly.
+                                        -- Proper (and slower) fix is to count utf8 length in lexToken
+                                        when (len' /= len) $ adjustPos (incPos (len' - len))
+                                        setStartCode in_section
+                                        return (L pos (Indent len')) }
   $spacetab* \{    { tok  OpenBrace }
   $spacetab* \}    { tok  CloseBrace }
 }
@@ -126,8 +133,13 @@ tokens :-
   @nbspspacetab* { \pos len inp -> checkLeadingWhitespace pos len inp >>= \len' ->
                                   if B.length inp == len
                                     then return (L pos EOF)
-                                    else setStartCode in_field_layout
-                                      >> return (L pos (Indent len')) }
+                                    else do
+                                      -- Small hack: if char and byte length mismatch
+                                      -- subtract the difference, so lexToken will count position correctly.
+                                      -- Proper (and slower) fix is to count utf8 length in lexToken
+                                      when (len' /= len) $ adjustPos (incPos (len' - len))
+                                      setStartCode in_field_layout
+                                      return (L pos (Indent len')) }
 }
 
 <in_field_layout> {
@@ -181,6 +193,9 @@ checkLeadingWhitespace pos len bs
 
 checkWhitespace :: Position -> Int -> ByteString -> Lex Int
 checkWhitespace pos len bs
+    -- UTF8 NBSP is 194 160. This function is called on whitespace bytestrings,
+    -- therefore counting 194 bytes is enough to count non-breaking spaces.
+    -- We subtract the amount of 194 bytes to convert bytes length into char length
     | B.any (== 194) (B.take len bs) = do
         addWarningAt pos LexWarningNBSP
         return $ len - B.count 194 (B.take len bs)