Skip to content

Commit

Permalink
[gatsby-transformer-remark] Don't generate the AST for the same node multiple times in parallel.
Browse files Browse the repository at this point in the history

If we are already generating the AST for a given node, wait for the result of that in-flight generation instead of starting another one.
  • Loading branch information
pieh committed Mar 27, 2018
1 parent 8454195 commit d12a3b2
Showing 1 changed file with 98 additions and 80 deletions.
178 changes: 98 additions & 80 deletions packages/gatsby-transformer-remark/src/extend-node-type.js
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,14 @@ const tableOfContentsCacheKey = node =>
// Prepend `pathPrefix` to `url`, collapsing the first doubled slash that
// can appear at the seam (e.g. prefix `/p/` + url `/a` → `/p/a`).
const withPathPrefix = (url, pathPrefix) => {
  const joined = pathPrefix + url
  return joined.replace(/\/\//, `/`)
}

/**
 * Tracks in-flight AST generation so the AST for a given node is not
 * generated multiple times in parallel: concurrent callers for the same
 * node await the promise already stored here instead of starting a new
 * generation. Keyed by the node's AST cache key; entries are removed once
 * the result has been written to the persistent cache.
 *
 * @type {Map<string,Promise>}
 */
const ASTPromiseMap = new Map()

module.exports = (
{ type, store, pathPrefix, getNode, cache, reporter },
pluginOptions
Expand Down Expand Up @@ -87,90 +95,28 @@ module.exports = (
}

/**
 * Return the remark AST for `markdownNode`.
 *
 * Resolution order:
 *   1. the persistent `cache` (fastest),
 *   2. an in-flight generation for the same node in `ASTPromiseMap`,
 *   3. a fresh generation, which is published in `ASTPromiseMap` *before*
 *      any async work so concurrent callers share one generation.
 *
 * Generation runs each plugin's `mutateSource` serially, parses the
 * (possibly mutated) source with remark, rewrites relative links to
 * include `pathPrefix`, then runs each transformer plugin serially.
 *
 * @param {Object} markdownNode - the MarkdownRemark node to build the AST for
 * @returns {Promise<Object>} the markdown AST
 */
async function getAST(markdownNode) {
  const cacheKey = astCacheKey(markdownNode)

  const cachedAST = await cache.get(cacheKey)
  if (cachedAST) {
    return cachedAST
  }

  if (ASTPromiseMap.has(cacheKey)) {
    // We are already generating the AST for this node — wait for it.
    return await ASTPromiseMap.get(cacheKey)
  }

  // NOTE: an async IIFE replaces the original `new Promise(async resolve => …)`
  // executor antipattern, so errors reject the shared promise instead of
  // becoming unhandled rejections.
  const ASTGenerationPromise = (async () => {
    try {
      const files = _.values(store.getState().nodes).filter(
        n => n.internal.type === `File`
      )

      const markdownAST = await new Promise((resolve, reject) => {
        // Use Bluebird's Promise.each to run the plugins' `mutateSource`
        // hooks serially — order matters for source mutation.
        Promise.each(pluginOptions.plugins, plugin => {
          const requiredPlugin = require(plugin.resolve)
          if (_.isFunction(requiredPlugin.mutateSource)) {
            return requiredPlugin.mutateSource(
              {
                markdownNode,
                files,
                getNode,
                reporter,
              },
              plugin.pluginOptions
            )
          } else {
            return Promise.resolve()
          }
        })
          .then(() => {
            const ast = remark.parse(markdownNode.internal.content)

            if (pathPrefix) {
              // Ensure relative links include `pathPrefix`.
              visit(ast, `link`, node => {
                if (
                  node.url &&
                  node.url.startsWith(`/`) &&
                  !node.url.startsWith(`//`)
                ) {
                  node.url = withPathPrefix(node.url, pathPrefix)
                }
              })
            }

            // Re-read File nodes: `mutateSource` hooks may have changed the
            // store since the snapshot taken above.
            const filesAfterMutation = _.values(store.getState().nodes).filter(
              n => n.internal.type === `File`
            )

            // Run the remark transformer plugins serially against the AST.
            return Promise.each(pluginOptions.plugins, plugin => {
              const requiredPlugin = require(plugin.resolve)
              if (_.isFunction(requiredPlugin)) {
                return requiredPlugin(
                  {
                    markdownAST: ast,
                    markdownNode,
                    getNode,
                    files: filesAfterMutation,
                    pathPrefix,
                    reporter,
                  },
                  plugin.pluginOptions
                )
              } else {
                return Promise.resolve()
              }
            }).then(() => resolve(ast))
          })
          // BUG FIX: the original never called `reject`, so plugin failures
          // were silently swallowed; propagate them to the caller.
          .catch(reject)
      })

      // Save the new AST to the cache; from now on the cache lookup above
      // serves this node.
      cache.set(cacheKey, markdownAST)
      return markdownAST
    } finally {
      // BUG FIX: the original called ASTPromiseMap.delete(astCacheKey),
      // passing the key *function* instead of the computed key string, so
      // entries were never removed and the map leaked one promise per node.
      // `finally` also releases the entry when generation fails.
      ASTPromiseMap.delete(cacheKey)
    }
  })()

  ASTPromiseMap.set(cacheKey, ASTGenerationPromise)
  return await ASTGenerationPromise
}
}

Expand Down

0 comments on commit d12a3b2

Please sign in to comment.