Skip to content

Commit

Permalink
Make readFields warn about inconsistent indentation.
Browse files Browse the repository at this point in the history
This is affect of using indentOfAtLeast method:
any indentation greater than current offset is fine.

That behavior is desirable to parsing multiline field contents,
but it is a bit surprising for fields, which we expect to be aligned.

Such insonsistency seems to be always a mistake, and it's easy to fix once
a machine points it out.
  • Loading branch information
phadej committed May 28, 2023
1 parent 9d8c248 commit 147ed3e
Show file tree
Hide file tree
Showing 9 changed files with 168 additions and 20 deletions.
27 changes: 21 additions & 6 deletions Cabal-syntax/src/Distribution/Fields/Lexer.hs
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ alex_actions = array (0 :: Int, 31)
, (0,alex_action_9)
]

{-# LINE 151 "templates/Lexer.x" #-}
{-# LINE 163 "templates/Lexer.x" #-}
-- | Tokens of outer cabal file structure. Field values are treated opaquely.
data Token = TokSym !ByteString -- ^ Haskell-like identifier, number or operator
| TokStr !ByteString -- ^ String in quotes
Expand Down Expand Up @@ -204,6 +204,9 @@ checkLeadingWhitespace pos len bs

checkWhitespace :: Position -> Int -> ByteString -> Lex Int
checkWhitespace pos len bs
-- UTF8 NBSP is 194 160. This function is called on whitespace bytestrings,
-- therefore counting 194 bytes is enough to count non-breaking spaces.
-- We subtract the amount of 194 bytes to convert bytes length into char length
| B.any (== 194) (B.take len bs) = do
addWarningAt pos LexWarningNBSP
return $ len - B.count 194 (B.take len bs)
Expand Down Expand Up @@ -315,14 +318,21 @@ in_field_layout = 5
in_section = 6
alex_action_0 = \pos len _ -> do
when (len /= 0) $ addWarningAt pos LexWarningBOM
setPos pos -- reset position as if BOM didn't exist
setStartCode bol_section
lexToken
alex_action_1 = \pos len inp -> checkWhitespace pos len inp >> adjustPos retPos >> lexToken
alex_action_3 = \pos len inp -> checkLeadingWhitespace pos len inp >>
alex_action_3 = \pos len inp -> checkLeadingWhitespace pos len inp >>= \len' ->
-- len' is character whitespace length (counting nbsp as one)
if B.length inp == len
then return (L pos EOF)
else setStartCode in_section
>> return (L pos (Indent len))
else do
-- Small hack: if char and byte length mismatch
-- subtract the difference, so lexToken will count position correctly.
-- Proper (and slower) fix is to count utf8 length in lexToken
when (len' /= len) $ adjustPos (incPos (len' - len))
setStartCode in_section
return (L pos (Indent len'))
alex_action_4 = tok OpenBrace
alex_action_5 = tok CloseBrace
alex_action_8 = toki TokSym
Expand All @@ -336,8 +346,13 @@ alex_action_15 = \_ _ _ -> adjustPos retPos >> setStartCode bol_section >> lexTo
alex_action_16 = \pos len inp -> checkLeadingWhitespace pos len inp >>= \len' ->
if B.length inp == len
then return (L pos EOF)
else setStartCode in_field_layout
>> return (L pos (Indent len'))
else do
-- Small hack: if char and byte length mismatch
-- subtract the difference, so lexToken will count position correctly.
-- Proper (and slower) fix is to count utf8 length in lexToken
when (len' /= len) $ adjustPos (incPos (len' - len))
setStartCode in_field_layout
return (L pos (Indent len'))
alex_action_18 = toki TokFieldLine
alex_action_19 = \_ _ _ -> adjustPos retPos >> setStartCode bol_field_layout >> lexToken
alex_action_20 = \_ _ _ -> setStartCode in_field_braces >> lexToken
Expand Down
13 changes: 8 additions & 5 deletions Cabal-syntax/src/Distribution/Fields/LexerMonad.hs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ module Distribution.Fields.LexerMonad (
import qualified Data.ByteString as B
import qualified Data.List.NonEmpty as NE
import Distribution.Compat.Prelude
import Distribution.Parsec.Position (Position (..), showPos)
import Distribution.Parsec.Position (Position (..), showPos, positionRow)
import Distribution.Parsec.Warning (PWarnType (..), PWarning (..))
import Prelude ()

Expand Down Expand Up @@ -65,9 +65,10 @@ instance Monad Lex where
data LexResult a = LexResult {-# UNPACK #-} !LexState a

data LexWarningType
= LexWarningNBSP -- ^ Encountered non breaking space
| LexWarningBOM -- ^ BOM at the start of the cabal file
| LexWarningTab -- ^ Leading tags
= LexWarningNBSP -- ^ Encountered non breaking space
| LexWarningBOM -- ^ BOM at the start of the cabal file
| LexWarningTab -- ^ Leading tags
| LexInconsistentIndentation -- ^ indentation decreases
deriving (Eq, Ord, Show)

data LexWarning = LexWarning !LexWarningType
Expand All @@ -78,7 +79,7 @@ toPWarnings :: [LexWarning] -> [PWarning]
toPWarnings
= map (uncurry toWarning)
. Map.toList
. Map.fromListWith (<>)
. Map.fromListWith (flip (<>)) -- fromListWith gives existing element first.
. map (\(LexWarning t p) -> (t, pure p))
where
toWarning LexWarningBOM poss =
Expand All @@ -87,6 +88,8 @@ toPWarnings
PWarning PWTLexNBSP (NE.head poss) $ "Non breaking spaces at " ++ intercalate ", " (NE.toList $ fmap showPos poss)
toWarning LexWarningTab poss =
PWarning PWTLexTab (NE.head poss) $ "Tabs used as indentation at " ++ intercalate ", " (NE.toList $ fmap showPos poss)
toWarning LexInconsistentIndentation poss =
PWarning PWTInconsistentIndentation (NE.head poss) $ "Inconsistent indentation. Indentation jumps at lines " ++ intercalate ", " (NE.toList $ fmap (show . positionRow) poss)

data LexState = LexState {
curPos :: {-# UNPACK #-} !Position, -- ^ position at current input location
Expand Down
34 changes: 30 additions & 4 deletions Cabal-syntax/src/Distribution/Fields/Parser.hs
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ import Distribution.Compat.Prelude
import Distribution.Fields.Field
import Distribution.Fields.Lexer
import Distribution.Fields.LexerMonad
(LexResult (..), LexState (..), LexWarning (..), unLex)
import Distribution.Parsec.Position (Position (..))
(LexResult (..), LexState (..), LexWarning (..), LexWarningType (..), unLex)
import Distribution.Parsec.Position (Position (..), positionCol)
import Prelude ()
import Text.Parsec.Combinator hiding (eof, notFollowedBy)
import Text.Parsec.Error
Expand Down Expand Up @@ -318,11 +318,37 @@ readFields' s = do
where
parser = do
fields <- cabalStyleFile
ws <- getLexerWarnings
pure (fields, ws)
ws <- getLexerWarnings -- lexer accumulates warnings in reverse (consing them to the list)
pure (fields, reverse ws ++ checkIndentation fields [])

lexSt = mkLexState' (mkLexState s)

-- | Check (recursively) that all fields inside a block are indented the same.
--
-- We have to do this as a post-processing check.
-- As the parser uses indentOfAtLeast approach, we don't know what is the "correct"
-- indentation for following fields.
--
-- To catch during parsing we would need to parse first field/section of a section
-- and then parse the following ones (softly) requiring the exactly the same indentation.
--
checkIndentation :: [Field Position] -> [LexWarning] -> [LexWarning]
checkIndentation [] = id
checkIndentation (Field name _ : fs') = checkIndentation' (nameAnn name) fs'
checkIndentation (Section name _ fs : fs') = checkIndentation fs . checkIndentation' (nameAnn name) fs'

-- | We compare adjacent fields to reduce the amount of reported indentation warnings.
checkIndentation' :: Position -> [Field Position] -> [LexWarning] -> [LexWarning]
checkIndentation' _ [] = id
checkIndentation' pos (Field name _ : fs') = checkIndentation'' pos (nameAnn name) . checkIndentation' (nameAnn name) fs'
checkIndentation' pos (Section name _ fs : fs') = checkIndentation'' pos (nameAnn name) . checkIndentation fs . checkIndentation' (nameAnn name) fs'

-- | Check that positions' columns are the same.
checkIndentation'' :: Position -> Position -> [LexWarning] -> [LexWarning]
checkIndentation'' a b
| positionCol a == positionCol b = id
| otherwise = (LexWarning LexInconsistentIndentation b :)

#ifdef CABAL_PARSEC_DEBUG
parseTest' :: Show a => Parsec LexState' () a -> SourceName -> B8.ByteString -> IO ()
parseTest' p fname s =
Expand Down
2 changes: 2 additions & 0 deletions Cabal-syntax/src/Distribution/Parsec/Warning.hs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ data PWarnType

| PWTEmptyFilePath -- ^ Empty filepath, i.e. literally ""

| PWTInconsistentIndentation -- ^ sections contents (sections and fields) are indented inconsistently

| PWTExperimental -- ^ Experimental feature
deriving (Eq, Ord, Show, Enum, Bounded, Generic)

Expand Down
1 change: 1 addition & 0 deletions Cabal-tests/tests/CheckTests.hs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ checkTests = testGroup "regressions"
, checkTest "issue-7776-b.cabal"
, checkTest "issue-7776-c.cabal"
, checkTest "issue-8646.cabal"
, checkTest "decreasing-indentation.cabal"
]

checkTest :: FilePath -> TestTree
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
name: RSA
category: Cryptography, Codec
version: 1.0.0
license: BSD3
license-file: LICENSE
author: Adam Wick <awick@galois.com>
maintainer: Adam Wick <awick@galois.com>
stability: stable
build-type: Simple
cabal-version: >= 1.2
tested-with: GHC ==6.8.0
synopsis: Implementation of RSA, using the padding schemes of PKCS#1 v2.1.
description: This library implements the RSA encryption and signature
algorithms for arbitrarily-sized ByteStrings. While the
implementations work, they are not necessarily the fastest ones
on the planet. Particularly key generation. The algorithms
included are based of RFC 3447, or the Public-Key Cryptography
Standard for RSA, version 2.1 (a.k.a, PKCS#1 v2.1).

Flag IncludeMD5
Description: Include support for using MD5 in the various crypto routines.

Flag UseBinary
Description: Use the binary package for serializing keys.

Library
build-depends: base >= 3
if flag(UseBinary)
build-depends: binary <10
CPP-Options: -DUSE_BINARY
if flag(IncludeMD5) && flag(UseBinary)
build-depends: pureMD5 <10
CPP-Options: -DINCLUDE_MD5
exposed-modules: Codec.Crypto.RSA

Executable test_rsa
build-depends: base >= 3
CPP-Options: -DRSA_TEST
Main-Is: Test.hs
Other-Modules: Codec.Crypto.RSA

-- The above is actual RSA-1.0.0 cabal file (slightly modified to produce less check warnings)

-- The following sections is further inconsistent indentation examples.

-- Note that here main-is is part of GHC-Options field. (and thus warned about as missing)
Executable warnings
build-depends: base < 5
GHC-Options: -Wall
main-is: warnings.hs
Other-Modules: FooBar

-- Increasing indentation is also possible if we use braces to delimit field contents.
Executable warnings2
build-depends: { base <5 }
main-is: { warnings2.hs }
Other-Modules: FooBar

-- another common mistake is something like below,
-- where a sub-section is over-indented
flag splitBase

Executable warnings3
if flag(splitBase)
build-depends: base >= 3
else
build-depends: base < 3

Main-Is: warnings3.hs
Other-Modules:
Graphics.UI.WXCore
Graphics.UI.WXCore.Wx
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
decreasing-indentation.cabal:38:3: Inconsistent indentation. Indentation jumps at lines 38, 49, 56, 57, 69
No 'main-is' field found for executable warnings
12 changes: 12 additions & 0 deletions changelog.d/inconsistent-indentation
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
synopsis: Warn about inconsistent indentation
packages: Cabal-syntax
prs: #8975

description:
Make readFields warn about inconsistent indentation.

This is an effect of using `indentOfAtLeast` method/approach: any indentation greater than current offset is accepted.

That behavior is desirable to parsing multiline field contents, but it is a bit surprising for fields in sections, which we expect to be aligned.

Such insonsistency seems to be always a mistake, and it's easy to fix once a machine points it out.
25 changes: 20 additions & 5 deletions templates/Lexer.x
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ tokens :-
<0> {
@bom? { \pos len _ -> do
when (len /= 0) $ addWarningAt pos LexWarningBOM
setPos pos -- reset position as if BOM didn't exist
setStartCode bol_section
lexToken
}
Expand All @@ -98,11 +99,17 @@ tokens :-
}
<bol_section> {
@nbspspacetab* { \pos len inp -> checkLeadingWhitespace pos len inp >>
@nbspspacetab* { \pos len inp -> checkLeadingWhitespace pos len inp >>= \len' ->
-- len' is character whitespace length (counting nbsp as one)
if B.length inp == len
then return (L pos EOF)
else setStartCode in_section
>> return (L pos (Indent len)) }
else do
-- Small hack: if char and byte length mismatch
-- subtract the difference, so lexToken will count position correctly.
-- Proper (and slower) fix is to count utf8 length in lexToken
when (len' /= len) $ adjustPos (incPos (len' - len))
setStartCode in_section
return (L pos (Indent len')) }
$spacetab* \{ { tok OpenBrace }
$spacetab* \} { tok CloseBrace }
}
Expand All @@ -126,8 +133,13 @@ tokens :-
@nbspspacetab* { \pos len inp -> checkLeadingWhitespace pos len inp >>= \len' ->
if B.length inp == len
then return (L pos EOF)
else setStartCode in_field_layout
>> return (L pos (Indent len')) }
else do
-- Small hack: if char and byte length mismatch
-- subtract the difference, so lexToken will count position correctly.
-- Proper (and slower) fix is to count utf8 length in lexToken
when (len' /= len) $ adjustPos (incPos (len' - len))
setStartCode in_field_layout
return (L pos (Indent len')) }
}

<in_field_layout> {
Expand Down Expand Up @@ -181,6 +193,9 @@ checkLeadingWhitespace pos len bs

checkWhitespace :: Position -> Int -> ByteString -> Lex Int
checkWhitespace pos len bs
-- UTF8 NBSP is 194 160. This function is called on whitespace bytestrings,
-- therefore counting 194 bytes is enough to count non-breaking spaces.
-- We subtract the amount of 194 bytes to convert bytes length into char length
| B.any (== 194) (B.take len bs) = do
addWarningAt pos LexWarningNBSP
return $ len - B.count 194 (B.take len bs)
Expand Down

0 comments on commit 147ed3e

Please sign in to comment.