Page *page-name* or one/multiple of its page sections failed to store properly. Page has been marked with null checksum to indicate that it needs to be re-generated. #46
Labels: bug
This happens both when running `pnpm run embeddings` locally and when deploying to Vercel. `NEXT_PUBLIC_SUPABASE_ANON_KEY`, `SUPABASE_SERVICE_ROLE_KEY`, and `OPENAI_KEY` are set in both `.env` and the Vercel environment variables. The run fails with:
```
details: 'TypeError: fetch failed\n' +
    '    at Object.fetch (node:internal/deps/undici/undici:11576:11)\n' +
    '    at generateEmbeddings (lib\generate-embeddings.ts:319:61)\n' +
    '    at main (lib\generate-embeddings.ts:487:3)',
  hint: '',
  code: ''
}
```
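For what it's worth, undici's `TypeError: fetch failed` is the generic error Node throws when the underlying request never completes (DNS failure, unreachable host, TLS problems), so it may help to rule out basic connectivity before digging into the script itself. Below is a minimal standalone probe, a sketch assuming the template's standard env var names; `/rest/v1/` and the `apikey` header are the usual Supabase REST conventions:

```ts
import dotenv from 'dotenv'

dotenv.config()

async function probe() {
  const url = process.env.NEXT_PUBLIC_SUPABASE_URL
  if (!url) throw new Error('NEXT_PUBLIC_SUPABASE_URL is not set')

  // An unreachable or malformed URL reproduces the same
  // `TypeError: fetch failed` shown in the stack trace above.
  const res = await fetch(`${url}/rest/v1/`, {
    headers: { apikey: process.env.SUPABASE_SERVICE_ROLE_KEY ?? '' },
  })
  console.log('Supabase reachable, status:', res.status)
}

probe().catch((err) => console.error(err))
```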
Here's `generate-embeddings.ts`, for reference.
```ts
import { createClient } from '@supabase/supabase-js'
import { createHash } from 'crypto'
import dotenv from 'dotenv'
import { ObjectExpression } from 'estree'
import { readdir, readFile, stat } from 'fs/promises'
import GithubSlugger from 'github-slugger'
import { Content, Root } from 'mdast'
import { fromMarkdown } from 'mdast-util-from-markdown'
import { mdxFromMarkdown, MdxjsEsm } from 'mdast-util-mdx'
import { toMarkdown } from 'mdast-util-to-markdown'
import { toString } from 'mdast-util-to-string'
import { mdxjs } from 'micromark-extension-mdxjs'
import 'openai'
import { Configuration, OpenAIApi } from 'openai'
import { basename, dirname, join } from 'path'
import { u } from 'unist-builder'
import { filter } from 'unist-util-filter'
import { inspect } from 'util'
import yargs from 'yargs'
dotenv.config()
const ignoredFiles = ['pages/404.mdx']
/**
 * Extracts ES literals from an `estree` `ObjectExpression`
 * into a plain JavaScript object.
 */
function getObjectFromExpression(node: ObjectExpression) {
  return node.properties.reduce<
    Record<string, string | number | bigint | true | RegExp | undefined>
  >((object, property) => {
    if (property.type !== 'Property') {
      return object
    }

    const key = (property.key.type === 'Identifier' && property.key.name) || undefined
    const value = (property.value.type === 'Literal' && property.value.value) || undefined

    if (!key) {
      return object
    }

    return {
      ...object,
      [key]: value,
    }
  }, {})
}
/**
 * Extracts the `meta` ESM export from the MDX file.
 */
function extractMetaExport(mdxTree: Root) {
const metaExportNode = mdxTree.children.find((node): node is MdxjsEsm => {
return (
node.type === 'mdxjsEsm' &&
node.data?.estree?.body[0]?.type === 'ExportNamedDeclaration' &&
node.data.estree.body[0].declaration?.type === 'VariableDeclaration' &&
node.data.estree.body[0].declaration.declarations[0]?.id.type === 'Identifier' &&
node.data.estree.body[0].declaration.declarations[0].id.name === 'meta'
)
})
if (!metaExportNode) {
return undefined
}
const objectExpression =
(metaExportNode.data?.estree?.body[0]?.type === 'ExportNamedDeclaration' &&
metaExportNode.data.estree.body[0].declaration?.type === 'VariableDeclaration' &&
metaExportNode.data.estree.body[0].declaration.declarations[0]?.id.type === 'Identifier' &&
metaExportNode.data.estree.body[0].declaration.declarations[0].id.name === 'meta' &&
metaExportNode.data.estree.body[0].declaration.declarations[0].init?.type ===
'ObjectExpression' &&
metaExportNode.data.estree.body[0].declaration.declarations[0].init) ||
undefined
if (!objectExpression) {
return undefined
}
return getObjectFromExpression(objectExpression)
}
/**
 * Splits a `mdast` tree into multiple trees based on
 * a predicate function. Will include the splitting node
 * at the beginning of each tree.
 *
 * Useful to split a markdown file into smaller sections.
 */
function splitTreeBy(tree: Root, predicate: (node: Content) => boolean) {
return tree.children.reduce<Root[]>((trees, node) => {
const [lastTree] = trees.slice(-1)
if (!lastTree || predicate(node)) {
const tree: Root = u('root', [node])
return trees.concat(tree)
}
lastTree.children.push(node)
return trees
}, [])
}
type Meta = ReturnType<typeof extractMetaExport>
type Section = {
content: string
heading?: string
slug?: string
}
type ProcessedMdx = {
checksum: string
meta: Meta
sections: Section[]
}
/**
 * Processes MDX content for search indexing.
 * It extracts metadata, strips it of all JSX,
 * and splits it into sub-sections based on headings.
 */
function processMdxForSearch(content: string): ProcessedMdx {
const checksum = createHash('sha256').update(content).digest('base64')
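  // The full script compares this checksum against the stored one to skip
  // unchanged pages, and nulls it out when a page fails to store (which is
  // where the error message in this issue's title comes from).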
const mdxTree = fromMarkdown(content, {
extensions: [mdxjs()],
mdastExtensions: [mdxFromMarkdown()],
})
const meta = extractMetaExport(mdxTree)
// Remove all MDX elements from markdown
const mdTree = filter(
mdxTree,
(node) =>
![
'mdxjsEsm',
'mdxJsxFlowElement',
'mdxJsxTextElement',
'mdxFlowExpression',
'mdxTextExpression',
].includes(node.type)
)
if (!mdTree) {
return {
checksum,
meta,
sections: [],
}
}
const sectionTrees = splitTreeBy(mdTree, (node) => node.type === 'heading')
const slugger = new GithubSlugger()
  const sections = sectionTrees.map((tree) => {
    const [firstNode] = tree.children

    const heading = firstNode.type === 'heading' ? toString(firstNode) : undefined
    const slug = heading ? slugger.slug(heading) : undefined

    return {
      content: toMarkdown(tree),
      heading,
      slug,
    }
  })
return {
checksum,
meta,
sections,
}
}
type WalkEntry = {
path: string
parentPath?: string
}
async function walk(dir: string, parentPath?: string): Promise<WalkEntry[]> {
const immediateFiles = await readdir(dir)
const recursiveFiles = await Promise.all(
immediateFiles.map(async (file) => {
const path = join(dir, file)
const stats = await stat(path)
      if (stats.isDirectory()) {
        // Keep track of document hierarchy (if this dir has corresponding doc file)
        const docPath = `${basename(path)}.mdx`

        return walk(
          path,
          immediateFiles.includes(docPath) ? join(dirname(path), docPath) : parentPath
        )
      } else if (stats.isFile()) {
        return [
          {
            path,
            parentPath,
          },
        ]
      } else {
        return []
      }
    })
  )

  const flattenedFiles = recursiveFiles.reduce<WalkEntry[]>(
    (all, folderContents) => all.concat(folderContents),
    []
  )
return flattenedFiles.sort((a, b) => a.path.localeCompare(b.path))
}
abstract class BaseEmbeddingSource {
checksum?: string
meta?: Meta
sections?: Section[]
constructor(public source: string, public path: string, public parentPath?: string) {}
abstract load(): Promise<{
checksum: string
meta?: Meta
sections: Section[]
}>
}
class MarkdownEmbeddingSource extends BaseEmbeddingSource {
type: 'markdown' = 'markdown'
  constructor(source: string, public filePath: string, public parentFilePath?: string) {
    const path = filePath.replace(/^pages/, '').replace(/\.mdx?$/, '')
    const parentPath = parentFilePath?.replace(/^pages/, '').replace(/\.mdx?$/, '')

    super(source, path, parentPath)
  }

  async load() {
    const contents = await readFile(this.filePath, 'utf8')

    const { checksum, meta, sections } = processMdxForSearch(contents)

    this.checksum = checksum
    this.meta = meta
    this.sections = sections

    return {
      checksum,
      meta,
      sections,
    }
  }
}
type EmbeddingSource = MarkdownEmbeddingSource
async function generateEmbeddings() {
const argv = await yargs.option('refresh', {
alias: 'r',
description: 'Refresh data',
type: 'boolean',
}).argv
const shouldRefresh = argv.refresh
if (
!process.env.NEXT_PUBLIC_SUPABASE_URL ||
!process.env.SUPABASE_SERVICE_ROLE_KEY ||
!process.env.OPENAI_KEY
) {
return console.log(
'Environment variables NEXT_PUBLIC_SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY, and OPENAI_KEY are required: skipping embeddings generation'
)
}
const supabaseClient = createClient(
process.env.NEXT_PUBLIC_SUPABASE_URL,
process.env.SUPABASE_SERVICE_ROLE_KEY,
{
auth: {
persistSession: false,
autoRefreshToken: false,
},
}
)
const embeddingSources: EmbeddingSource[] = [
...(await walk('pages'))
      .filter(({ path }) => /\.mdx?$/.test(path))
.filter(({ path }) => !ignoredFiles.includes(path))
.map((entry) => new MarkdownEmbeddingSource('guide', entry.path)),
]
  console.log(`Discovered ${embeddingSources.length} pages`)

  if (!shouldRefresh) {
console.log('Checking which pages are new or have changed')
} else {
console.log('Refresh flag set, re-generating all pages')
}
for (const embeddingSource of embeddingSources) {
const { type, source, path, parentPath } = embeddingSource
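    // The per-page work (checksum comparison, OpenAI embedding request, and
    // Supabase upsert) appears truncated in this paste; the fetch that fails
    // at generate-embeddings.ts:319 most likely originates in this loop.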
}
console.log('Embedding generation complete')
}
async function main() {
await generateEmbeddings()
}
main().catch((err) => console.error(err))
```
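Since the failure surfaces as a network-level error rather than an API error response, one workaround is to retry the flaky call with backoff. A minimal sketch follows; `withRetries` is a hypothetical helper, not part of the template, and the commented usage assumes the v3 `openai.createEmbedding` call implied by the `Configuration`/`OpenAIApi` imports above:

```ts
// Hypothetical helper, not part of the template: retry a flaky
// network call a few times with linear backoff before giving up.
async function withRetries<T>(fn: () => Promise<T>, attempts = 3): Promise<T> {
  for (let i = 1; i <= attempts; i++) {
    try {
      return await fn()
    } catch (err) {
      if (i === attempts) throw err
      // Wait 1s, then 2s, between attempts.
      await new Promise((resolve) => setTimeout(resolve, 1000 * i))
    }
  }
  // Unreachable: the loop either returns or rethrows on the last attempt.
  throw new Error('unreachable')
}

// Example usage around the embedding request inside generateEmbeddings():
// const embeddingResponse = await withRetries(() =>
//   openai.createEmbedding({ model: 'text-embedding-ada-002', input })
// )
```

If the error persists across retries, that points at something environmental (proxy, IPv6 resolution, firewall) rather than a transient blip.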