-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate-titles.ts
150 lines (139 loc) · 4.11 KB
/
create-titles.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import { existsSync, readFileSync } from 'fs';
import { join } from 'path';
// Settings file location: `config.json` one directory above this script.
const CONFIG_PATH = join(__dirname, '..', 'config.json');

// Stop right away with a readable message when the settings file is absent.
const configAvailable = existsSync(CONFIG_PATH);
if (!configAvailable) {
  console.error(
    'Config file not found, please create a config.json file in the project root directory',
  );
  process.exit(1);
}

// Parsed settings object; the rest of the script reads its keys directly.
const CONFIG = JSON.parse(readFileSync(CONFIG_PATH, 'utf-8'));
import fetch from 'node-fetch';
import { setTimeout } from 'timers/promises';
// Base64 encoding of the configured Paperless credentials; the request helpers
// send it as the value of the `Authorization: Basic …` header.
// NOTE(review): presumably the config value has the form `user:password` — confirm.
const PAPERLESS_CREDENTIALS = Buffer.from(
CONFIG.PAPERLESS_CREDENTIALS,
).toString('base64');
// Prompt prefix for the title-generation request. `_runCompletion` appends the
// document text directly after the trailing `Content:` line and cuts the
// combined prompt to 4000 characters.
// NOTE(review): "exlusivly" is a typo inside the prompt text; it is left as-is
// here because editing the prompt changes what the model receives.
const TEMPLATE = `Could you please generate a fitting title for the OCR of this PDF file based on the initial content?
Try to infer the document's topic and provide a short title exlusivly in German.
The title should only include the heading and if possible the date of the document, the title should also *not exceed 100 characters*.
Please format your response in such a way that only the title is included, making it easy to parse.
For example \`Title: "this is where the title goes"\`
Content:
`;
/**
 * Retitles every fetched Paperless document that has not been handled yet.
 *
 * For each document without a `Processed(v2)` note the run waits one second,
 * asks the LLM for a title, writes the title back and then adds the marker
 * note. An empty title, or one longer than 120 characters, is skipped without
 * adding the marker, so that document is considered again on the next run.
 *
 * @throws Whatever the request helpers throw; the first failure ends the run.
 */
async function main(): Promise<void> {
  // Single source for the marker so the skip check and the written note agree.
  const processedMarker = 'Processed(v2)';
  const maxTitleLength = 120;

  console.log('Fetching documents...');
  const documents = await getDocuments();
  console.log(`Fetched ${documents.results.length} documents`);

  for (const doc of documents.results) {
    const alreadyProcessed = doc.notes.some(({ note }: { note: string }) =>
      note.includes(processedMarker),
    );
    if (alreadyProcessed) {
      continue;
    }

    // Pause before each completion request.
    await setTimeout(1000);

    const newTitle = await createNewTitle(doc.content);
    if (!newTitle) {
      continue;
    }
    if (newTitle.length > maxTitleLength) {
      console.error(`Title too long: ${newTitle}`);
      continue;
    }

    console.log(`New Title: "${newTitle}"`);
    await updateDocument({ ...doc, title: newTitle });
    // The marker is written only after the title update succeeded.
    await addNote(doc.id, processedMarker);
  }
}

// Handle rejections explicitly: report the error and signal failure through
// the exit code instead of leaving an unhandled promise rejection.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
/**
 * Fetches the document list from the Paperless API, following pagination.
 *
 * Pages of up to 5000 documents are requested by page number for as long as
 * the previous page reports a `next` page, so libraries larger than one page
 * are no longer silently truncated.
 *
 * @returns The first page's JSON body with `results` holding the documents of
 *   all pages.
 * @throws Error when any page request returns a non-OK status; the message
 *   carries the status text and the response body.
 */
async function getDocuments() {
  const listing = await _fetchDocumentsPage(1);
  let hasNextPage = Boolean(listing?.next);

  for (let page = 2; hasNextPage; page++) {
    const current = await _fetchDocumentsPage(page);
    listing.results = listing.results.concat(current.results);
    hasNextPage = Boolean(current.next);
    // The merged listing is complete once the last page has been appended.
    listing.next = current.next;
  }

  return listing;
}

/** Requests one page of the document list and returns its parsed JSON body. */
async function _fetchDocumentsPage(page: number) {
  const response = await fetch(
    `${CONFIG.PAPERLESS_API_URL}/documents/?page=${page}&page_size=5000`,
    {
      headers: {
        Authorization: `Basic ${PAPERLESS_CREDENTIALS}`,
      },
    },
  );
  if (!response.ok) {
    throw new Error(
      `Failed to get documents: ${
        response.statusText
      }\n\n${await response.text()}`,
    );
  }
  return await response.json();
}
/**
 * Creates a note on a Paperless document.
 *
 * @param documentId - Identifier inserted into the `/documents/{id}/notes/` URL.
 * @param note - Text of the note, sent as the `note` field of the JSON body.
 * @returns The parsed JSON body of the response.
 * @throws Error when the response status is not OK; the message carries the
 *   status text and the response body.
 */
async function addNote(documentId: string, note: string) {
  const endpoint = `${CONFIG.PAPERLESS_API_URL}/documents/${documentId}/notes/`;
  const payload = JSON.stringify({ note });

  const response = await fetch(endpoint, {
    method: 'POST',
    headers: {
      Authorization: `Basic ${PAPERLESS_CREDENTIALS}`,
      'Content-Type': 'application/json',
    },
    body: payload,
  });

  if (response.ok) {
    return await response.json();
  }

  const details = await response.text();
  throw new Error(
    `Failed to add note to document: ${response.statusText}\n\n${details}`,
  );
}
/**
 * Replaces a Paperless document with the given representation via `PUT`.
 *
 * @param newDocument - Document object to store; its `id` selects the target
 *   URL and the whole object is sent as the JSON body.
 * @returns The parsed JSON body of the response.
 * @throws Error when the response status is not OK; the message carries the
 *   status text and the response body.
 */
async function updateDocument(newDocument: any) {
  const endpoint = `${CONFIG.PAPERLESS_API_URL}/documents/${newDocument.id}/`;
  const requestHeaders = {
    Authorization: `Basic ${PAPERLESS_CREDENTIALS}`,
    'Content-Type': 'application/json',
  };

  const response = await fetch(endpoint, {
    method: 'PUT',
    headers: requestHeaders,
    body: JSON.stringify(newDocument),
  });

  if (response.ok) {
    return response.json();
  }

  const details = await response.text();
  throw new Error(
    `Failed to update document: ${response.statusText}\n\n${details}`,
  );
}
/**
 * Sends the title prompt plus document text to the configured Ollama
 * `/generate` endpoint and returns the generated text.
 *
 * @param content - Document text appended to `TEMPLATE`; the combined prompt is
 *   cut to its first 4000 characters.
 * @returns The `response` string of the reply.
 * @throws Error when the response status is not OK (message carries the status
 *   text and the response body) or when the reply has no string `response`.
 */
async function _runCompletion(content: string) {
  const prompt = (TEMPLATE + content).slice(0, 4000);

  const response = await fetch(`${CONFIG.OLLAMA_API_URL}/generate`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model: CONFIG.OLLAMA_LLM_MODEL,
      prompt,
      // Request one complete JSON reply instead of a stream of chunks.
      stream: false,
      options: {
        temperature: 0.2,
        top_p: 0.5,
        seed: 1,
        num_predict: 110,
      },
    }),
  });

  if (!response.ok) {
    // Include the body, like the other request helpers, so the cause is visible.
    throw new Error(
      `Failed to complete: ${response.statusText}\n\n${await response.text()}`,
    );
  }

  const reply = await response.json();
  // Fail here with a clear message rather than handing `undefined` to the
  // caller, which would only surface later as an unrelated TypeError.
  if (typeof reply?.response !== 'string') {
    throw new Error(
      'Failed to complete: reply did not contain a "response" string',
    );
  }
  return reply.response;
}
/**
 * Asks the LLM for a title for the given document text and extracts it from
 * the reply.
 *
 * The prompt requests the form `Title: "…"`. When the reply contains a
 * `label: "…"` pair, the quoted part is returned. Otherwise the whole reply is
 * used with double quotes removed and a leading `Title:` / `Titel:` label
 * stripped, so the label never ends up in the stored title.
 *
 * @param content - Document text passed on to `_runCompletion`.
 * @returns The trimmed title; may be an empty string.
 * @throws Error when the completion result is not a string, plus anything
 *   `_runCompletion` throws.
 */
async function createNewTitle(content: string) {
  const completion: unknown = await _runCompletion(content);
  if (typeof completion !== 'string') {
    throw new Error('Failed to create title: completion result is not a string');
  }

  const quotedTitle = /\w+: "(.*)"/.exec(completion)?.[1];
  if (quotedTitle !== undefined) {
    return quotedTitle.trim();
  }

  // Fallback for replies that ignored the requested quoting.
  return completion
    .replace(/"/g, '')
    .trim()
    .replace(/^(?:title|titel)\s*:\s*/i, '');
}