Ability to ignore any whitespace in the formulas (config option ignoreWhiteSpace)

* Changed whitespace regex to `/\s+/` (#877)
* Changed whitespace regex -> `/\s+/`.
* Updated CHANGELOG.md
* Add test for unparsing non-break space character
* Introduce config param ignoreWhiteSpace
* Fix eslint warning
* Change the config parameter ignoreWhiteSpace from boolean to 'standard' | 'any'

Co-authored-by: Martin Alex Philip Dawson <u1356770@gmail.com>
handsontable · Mar 31, 2022 · c125bf8 · c125bf8
1 parent 1e807ce
commit c125bf8
Show file tree

Hide file tree

Showing 9 changed files with 115 additions and 53 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+- Added support for parsing formulas that contain any whitespace character (controlled by the config param `ignoreWhiteSpace`). (#898)
+
 ### Changed
 - **Breaking change**: Removed `gpu.js` dependency and its use. (#812)
 

diff --git a/src/Config.ts b/src/Config.ts
@@ -163,6 +163,18 @@ export interface ConfigParams {
    * @category Formula Syntax
    */
   language: string,
+  /**
+   * Controls the set of whitespace characters that are allowed inside a formula.
+   *
+   * When set to `'standard'`, allows only SPACE (U+0020), CHARACTER TABULATION (U+0009), LINE FEED (U+000A), and CARRIAGE RETURN (U+000D) (compliant with the OpenFormula Standard 1.3).
+   *
+   * When set to `'any'`, allows all whitespace characters that would be captured by the `\s` character class of JavaScript regular expressions.
+   *
+   * @default 'standard'
+   *
+   * @category Formula Syntax
+   */
+  ignoreWhiteSpace: 'standard' | 'any',
   /**
    * Sets year 1900 as a leap year.
    *
@@ -436,6 +448,7 @@ export class Config implements ConfigParams, ParserConfig {
     functionPlugins: [],
     ignorePunctuation: false,
     language: 'enGB',
+    ignoreWhiteSpace: 'standard',
     licenseKey: '',
     leapYear1900: false,
     localeLang: 'en',
@@ -489,6 +502,8 @@ export class Config implements ConfigParams, ParserConfig {
   /** @inheritDoc */
   public readonly language: string
   /** @inheritDoc */
+  public readonly ignoreWhiteSpace: 'standard' | 'any'
+  /** @inheritDoc */
   public readonly licenseKey: string
   /** @inheritDoc */
     // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -564,6 +579,7 @@ export class Config implements ConfigParams, ParserConfig {
       leapYear1900,
       localeLang,
       language,
+      ignoreWhiteSpace,
       licenseKey,
       matchWholeCell,
       arrayColumnSeparator,
@@ -603,6 +619,7 @@ export class Config implements ConfigParams, ParserConfig {
     this.functionArgSeparator = configValueFromParam(functionArgSeparator, 'string', 'functionArgSeparator')
     this.decimalSeparator = configValueFromParam(decimalSeparator, ['.', ','], 'decimalSeparator')
     this.language = configValueFromParam(language, 'string', 'language')
+    this.ignoreWhiteSpace = configValueFromParam(ignoreWhiteSpace, ['standard', 'any'], 'ignoreWhiteSpace')
     this.licenseKey = configValueFromParam(licenseKey, 'string', 'licenseKey')
     this.thousandSeparator = configValueFromParam(thousandSeparator, ['', ',', ' ', '.'], 'thousandSeparator')
     this.arrayColumnSeparator = configValueFromParam(arrayColumnSeparator, [',', ';'], 'arrayColumnSeparator')
@@ -677,8 +694,8 @@ export class Config implements ConfigParams, ParserConfig {
    *
    * @internal
    */
-  public get licenseKeyValidityState() {
-    return privatePool.get(this)!.licenseKeyValidityState
+  public get licenseKeyValidityState(): LicenseKeyValidityState {
+    return (privatePool.get(this) as Config).licenseKeyValidityState
   }
 
   public getConfig(): ConfigParams {

diff --git a/src/parser/FormulaParser.ts b/src/parser/FormulaParser.ts
@@ -94,7 +94,6 @@ import {
   RParen,
   StringLiteral,
   TimesOp,
-  WhiteSpace,
 } from './LexerConfig'
 
 export interface FormulaParserResult {
@@ -894,15 +893,15 @@ export class FormulaLexer {
   private skipWhitespacesInsideRanges(tokens: IToken[]): IToken[] {
     return this.filterTokensByNeighbors(tokens, (previous: IToken, current: IToken, next: IToken) => {
       return (tokenMatcher(previous, CellReference) || tokenMatcher(previous, RangeSeparator))
-        && tokenMatcher(current, WhiteSpace)
+        && tokenMatcher(current, this.lexerConfig.WhiteSpace)
         && (tokenMatcher(next, CellReference) || tokenMatcher(next, RangeSeparator))
     })
   }
 
   private skipWhitespacesBeforeArgSeparators(tokens: IToken[]): IToken[] {
     return this.filterTokensByNeighbors(tokens, (previous: IToken, current: IToken, next: IToken) => {
       return !tokenMatcher(previous, this.lexerConfig.ArgSeparator)
-        && tokenMatcher(current, WhiteSpace)
+        && tokenMatcher(current, this.lexerConfig.WhiteSpace)
         && tokenMatcher(next, this.lexerConfig.ArgSeparator)
     })
   }
@@ -928,7 +927,7 @@ export class FormulaLexer {
   }
 
   private trimTrailingWhitespaces(tokens: IToken[]): IToken[] {
-    if (tokens.length > 0 && tokenMatcher(tokens[tokens.length - 1], WhiteSpace)) {
+    if (tokens.length > 0 && tokenMatcher(tokens[tokens.length - 1], this.lexerConfig.WhiteSpace)) {
       tokens.pop()
     }
     return tokens

diff --git a/src/parser/LexerConfig.ts b/src/parser/LexerConfig.ts
@@ -9,6 +9,8 @@ import {ParserConfig} from './ParserConfig'
 
 export const RANGE_OPERATOR = ':'
 export const ABSOLUTE_OPERATOR = '$'
+export const ALL_WHITESPACE_REGEXP = /\s+/
+export const ODFF_WHITESPACE_REGEXP = /[ \t\n\r]+/
 
 /* arithmetic */
 // abstract for + -
@@ -92,12 +94,6 @@ export const StringLiteral = createToken({name: 'StringLiteral', pattern: /"([^"
 /* error literal */
 export const ErrorLiteral = createToken({name: 'ErrorLiteral', pattern: /#[A-Za-z0-9\/]+[?!]?/})
 
-/* skipping whitespaces */
-export const WhiteSpace = createToken({
-  name: 'WhiteSpace',
-  pattern: /[ \t\n\r]+/,
-})
-
 export interface ILexerConfig {
   ArgSeparator: TokenType,
   NumberLiteral: TokenType,
@@ -108,6 +104,7 @@ export interface ILexerConfig {
   decimalSeparator: '.' | ',',
   ArrayColSeparator: TokenType,
   ArrayRowSeparator: TokenType,
+  WhiteSpace: TokenType,
   maxColumns: number,
   maxRows: number,
 }
@@ -116,7 +113,9 @@ export const buildLexerConfig = (config: ParserConfig): ILexerConfig => {
   const offsetProcedureNameLiteral = config.translationPackage.getFunctionTranslation('OFFSET')
   const errorMapping = config.errorMapping
   const functionMapping = config.translationPackage.buildFunctionMapping()
+  const whitespaceTokenRegexp = config.ignoreWhiteSpace === 'standard' ? ODFF_WHITESPACE_REGEXP : ALL_WHITESPACE_REGEXP
 
+  const WhiteSpace = createToken({ name: 'WhiteSpace', pattern: whitespaceTokenRegexp })
   const ArrayRowSeparator = createToken({name: 'ArrayRowSep', pattern: config.arrayRowSeparator})
   const ArrayColSeparator = createToken({name: 'ArrayColSep', pattern: config.arrayColumnSeparator})
 
@@ -185,6 +184,7 @@ export const buildLexerConfig = (config: ParserConfig): ILexerConfig => {
     OffsetProcedureName,
     ArrayRowSeparator,
     ArrayColSeparator,
+    WhiteSpace,
     allTokens,
     errorMapping,
     functionMapping,

diff --git a/src/parser/ParserConfig.ts b/src/parser/ParserConfig.ts
@@ -11,6 +11,7 @@ export interface ParserConfig {
   decimalSeparator: '.' | ',',
   arrayColumnSeparator: ',' | ';',
   arrayRowSeparator: ';' | '|',
+  ignoreWhiteSpace: 'standard' | 'any',
   translationPackage: TranslationPackage,
   errorMapping: Record<string, TranslatableErrorType>,
   maxColumns: number,

diff --git a/src/parser/ParserWithCaching.ts b/src/parser/ParserWithCaching.ts
@@ -3,7 +3,7 @@
  * Copyright (c) 2021 Handsoncode. All rights reserved.
  */
 
-import {IToken, tokenMatcher} from 'chevrotain'
+import {IToken, tokenMatcher, ILexingResult} from 'chevrotain'
 import {ErrorType, SimpleCellAddress} from '../Cell'
 import {FunctionRegistry} from '../interpreter/FunctionRegistry'
 import {AstNodeType, buildParsingErrorAst, RelativeDependency} from './'
@@ -24,7 +24,6 @@ import {
   ILexerConfig,
   ProcedureName,
   RowRange,
-  WhiteSpace,
 } from './LexerConfig'
 import {ParserConfig} from './ParserConfig'
 import {formatNumber} from './Unparser'
@@ -65,7 +64,7 @@ export class ParserWithCaching {
    * @param formulaAddress - address with regard to which formula should be parsed. Impacts computed addresses in R0C0 format.
    */
   public parse(text: string, formulaAddress: SimpleCellAddress): ParsingResult {
-    const lexerResult = this.lexer.tokenizeFormula(text)
+    const lexerResult = this.tokenizeFormula(text)
 
     if (lexerResult.errors.length > 0) {
       const errors = lexerResult.errors.map((e) =>
@@ -89,7 +88,7 @@ export class ParserWithCaching {
     if (cacheResult !== undefined) {
       ++this.statsCacheUsed
     } else {
-      const processedTokens = bindWhitespacesToTokens(lexerResult.tokens)
+      const processedTokens = this.bindWhitespacesToTokens(lexerResult.tokens)
       const parsingResult = this.formulaParser.parseFromTokens(processedTokens, formulaAddress)
 
       if (parsingResult.errors.length > 0) {
@@ -230,28 +229,32 @@ export class ParserWithCaching {
       }
     }
   }
-}
-
-export function bindWhitespacesToTokens(tokens: IToken[]): IExtendedToken[] {
-  const processedTokens: IExtendedToken[] = []
 
-  const first = tokens[0]
-  if (!tokenMatcher(first, WhiteSpace)) {
-    processedTokens.push(first)
-  }
+  public bindWhitespacesToTokens(tokens: IToken[]): IExtendedToken[] {
+    const processedTokens: IExtendedToken[] = []
 
-  for (let i = 1; i < tokens.length; ++i) {
-    const current = tokens[i] as IExtendedToken
-    if (tokenMatcher(current, WhiteSpace)) {
-      continue
+    const first = tokens[0]
+    if (!tokenMatcher(first, this.lexerConfig.WhiteSpace)) {
+      processedTokens.push(first)
     }
 
-    const previous = tokens[i - 1]
-    if (tokenMatcher(previous, WhiteSpace)) {
-      current.leadingWhitespace = previous
+    for (let i = 1; i < tokens.length; ++i) {
+      const current = tokens[i] as IExtendedToken
+      if (tokenMatcher(current, this.lexerConfig.WhiteSpace)) {
+        continue
+      }
+
+      const previous = tokens[i - 1]
+      if (tokenMatcher(previous, this.lexerConfig.WhiteSpace)) {
+        current.leadingWhitespace = previous
+      }
+      processedTokens.push(current)
     }
-    processedTokens.push(current)
+
+    return processedTokens
   }
 
-  return processedTokens
+  public tokenizeFormula(text: string): ILexingResult {
+    return this.lexer.tokenizeFormula(text)
+  }
 }
diff --git a/test/parser/parser.spec.ts b/test/parser/parser.spec.ts
@@ -164,6 +164,24 @@ describe('ParserWithCaching', () => {
     expect(ast1).toEqual(ast2)
   })
 
+  it('with default config should return error for non-breakable space', () => {
+    const parser = buildEmptyParserWithCaching(new Config())
+
+    const { ast, errors } = parser.parse('=\u00A042', adr('A1'))
+
+    expect(ast.type).toBe(AstNodeType.ERROR)
+    expect(errors[0].type).toBe(ParsingErrorType.LexingError)
+  })
+
+  it('when set ignoreWhiteSpace = \'any\' should accept a non-breakable space', () => {
+    const parser = buildEmptyParserWithCaching(new Config({ ignoreWhiteSpace: 'any' }))
+
+    const { ast } = parser.parse('=\u00A042', adr('A1'))
+
+    expect(ast.type).toEqual(AstNodeType.NUMBER)
+    expect(ast.leadingWhitespace).toEqual('\u00A0')
+  })
+
   it('error literal', () => {
     const parser = buildEmptyParserWithCaching(new Config())
 

diff --git a/test/parser/unparse.spec.ts b/test/parser/unparse.spec.ts
@@ -597,4 +597,18 @@ describe('whitespaces', () => {
 
     expect(unparsed).toEqual(formula)
   })
+
+  it('when ignoreWhiteSpace = \'any\', should unparse a non-breakable space character', () => {
+    const config = new Config({ ignoreWhiteSpace: 'any' })
+    const lexerConfig = buildLexerConfig(config)
+    const parser = buildEmptyParserWithCaching(config, sheetMapping)
+    const unparser = new Unparser(config, lexerConfig, sheetMapping.fetchDisplayName, new NamedExpressions())
+
+    const formula = '=\u00A01'
+    const ast = parser.parse(formula, adr('A1')).ast
+
+    const unparsed = unparser.unparse(ast, adr('A1'))
+
+    expect(unparsed).toEqual(formula)
+  })
 })