Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support more Confluence URL formats #2118

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions collector/extensions/resync/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ async function resyncConfluence({ chunkSource }, response) {
const { success, reason, content } = await fetchConfluencePage({
pageUrl: `https:${source.pathname}`, // need to add back the real protocol
baseUrl: source.searchParams.get('baseUrl'),
spaceKey: source.searchParams.get('spaceKey'),
accessToken: source.searchParams.get('token'),
username: source.searchParams.get('username'),
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,9 @@ class ConfluencePagesLoader {
}
}

// https://developer.atlassian.com/cloud/confluence/rest/v2/intro/#auth
async fetchAllPagesInSpace(start = 0, limit = this.limit) {
const url = `${this.baseUrl}/rest/api/content?spaceKey=${this.spaceKey}&limit=${limit}&start=${start}&expand=${this.expand}`;
const url = `${this.baseUrl}/wiki/rest/api/content?spaceKey=${this.spaceKey}&limit=${limit}&start=${start}&expand=${this.expand}`;
const data = await this.fetchConfluenceData(url);
if (data.size === 0) {
return [];
Expand Down
150 changes: 45 additions & 105 deletions collector/utils/extensions/Confluence/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ const fs = require("fs");
const path = require("path");
const { default: slugify } = require("slugify");
const { v4 } = require("uuid");
const UrlPattern = require("url-pattern");
const { writeToServerDocuments, sanitizeFileName } = require("../../files");
const { tokenizeString } = require("../../tokenizer");
const { ConfluencePagesLoader } = require("./ConfluenceLoader");
Expand All @@ -13,28 +12,36 @@ const { ConfluencePagesLoader } = require("./ConfluenceLoader");
* @param {import("../../../middleware/setDataSigner").ResponseWithSigner} response - Express response object with encryptionWorker
* @returns
*/
async function loadConfluence({ pageUrl, username, accessToken }, response) {
if (!pageUrl || !username || !accessToken) {
async function loadConfluence(
{ baseUrl = null, spaceKey = null, username = null, accessToken = null },
response
) {
if (!baseUrl || !spaceKey || !username || !accessToken) {
return {
success: false,
reason:
"You need either a username and access token, or a personal access token (PAT), to use the Confluence connector.",
};
}

const { valid, result } = validSpaceUrl(pageUrl);
if (!valid) {
if (!validBaseUrl(baseUrl)) {
return {
success: false,
reason:
"Confluence space URL is not in the expected format of one of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/* or https://customDomain/display/~SPACEID/*",
reason: "Provided base URL is not a valid URL.",
};
}

const { apiBase: baseUrl, spaceKey, subdomain } = result;
console.log(`-- Working Confluence ${baseUrl} --`);
if (!spaceKey) {
return {
success: false,
reason: "You need to provide a Confluence space key.",
};
}

const { origin, hostname } = new URL(baseUrl);
console.log(`-- Working Confluence ${origin} --`);
const loader = new ConfluencePagesLoader({
baseUrl,
baseUrl: origin, // Use the origin to avoid issues with subdomains, ports, protocols, etc.
spaceKey,
username,
accessToken,
Expand All @@ -59,7 +66,7 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) {
};
}
const outFolder = slugify(
`${subdomain}-confluence-${v4().slice(0, 4)}`
`confluence-${origin}-${v4().slice(0, 4)}`
).toLowerCase();

const outFolderPath =
Expand All @@ -80,11 +87,11 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) {
id: v4(),
url: doc.metadata.url + ".page",
title: doc.metadata.title || doc.metadata.source,
docAuthor: subdomain,
docAuthor: origin,
description: doc.metadata.title,
docSource: `${subdomain} Confluence`,
docSource: `${origin} Confluence`,
chunkSource: generateChunkSource(
{ doc, baseUrl, accessToken, username },
{ doc, baseUrl: origin, spaceKey, accessToken, username },
response.locals.encryptionWorker
),
published: new Date().toLocaleString(),
Expand Down Expand Up @@ -120,10 +127,11 @@ async function loadConfluence({ pageUrl, username, accessToken }, response) {
async function fetchConfluencePage({
pageUrl,
baseUrl,
spaceKey,
username,
accessToken,
}) {
if (!pageUrl || !baseUrl || !username || !accessToken) {
if (!pageUrl || !baseUrl || !spaceKey || !username || !accessToken) {
return {
success: false,
content: null,
Expand All @@ -132,20 +140,25 @@ async function fetchConfluencePage({
};
}

const { valid, result } = validSpaceUrl(pageUrl);
if (!valid) {
if (!validBaseUrl(baseUrl)) {
return {
success: false,
content: null,
reason:
"Confluence space URL is not in the expected format of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/*",
reason: "Provided base URL is not a valid URL.",
};
}

if (!spaceKey) {
return {
success: false,
content: null,
reason: "You need to provide a Confluence space key.",
};
}

console.log(`-- Working Confluence Page ${pageUrl} --`);
const { spaceKey } = result;
const loader = new ConfluencePagesLoader({
baseUrl,
baseUrl, // Should be the origin of the baseUrl
spaceKey,
username,
accessToken,
Expand Down Expand Up @@ -190,91 +203,17 @@ async function fetchConfluencePage({
}

/**
* A match result for a url-pattern of a Confluence URL
* @typedef {Object} ConfluenceMatchResult
* @property {string} subdomain - the subdomain of an organization's Confluence space
* @property {string} spaceKey - the spaceKey of an organization that determines the documents to collect.
* @property {string} apiBase - the correct REST API url to use for loader.
*/

/**
 * Builds the base URL used to reach the Confluence REST API from a parsed
 * Confluence URL match.
 *
 * Two shapes are produced:
 *  - custom domain: `https://<subdomain>` (the matched host is used as-is)
 *  - otherwise:     `https://<subdomain>.atlassian.net/wiki`
 *
 * @param {ConfluenceMatchResult} matchResult - result from `url-pattern`.match; only `subdomain` is read.
 * @param {boolean} isCustomDomain - when truthy, skip the `.atlassian.net/wiki` suffix.
 * @returns {string} - the resulting REST API base URL
 */
function generateAPIBaseUrl(matchResult = {}, isCustomDomain = false) {
  // Read the host segment up front so a null matchResult fails before any branching.
  const host = matchResult.subdomain;
  return isCustomDomain
    ? `https://${host}`
    : `https://${host}.atlassian.net/wiki`;
}

/**
* Validates and parses the correct information from a given Confluence URL
* @param {string} spaceUrl - The organization's Confluence URL to parse
* @returns {{
* valid: boolean,
* result: (ConfluenceMatchResult|null),
* }}
* Validates if the provided baseUrl is a valid URL at all.
* @param {string} baseUrl
* @returns {boolean}
*/
function validSpaceUrl(spaceUrl = "") {
let matchResult;
const patterns = {
default: new UrlPattern(
"https\\://(:subdomain).atlassian.net/wiki/spaces/(:spaceKey)*"
),
subdomain: new UrlPattern(
"https\\://(:subdomain.):domain.:tld/wiki/spaces/(:spaceKey)*"
),
custom: new UrlPattern(
"https\\://(:subdomain.):domain.:tld/display/(:spaceKey)*"
),
};

// If using the default Atlassian Confluence URL pattern.
// We can proceed because the Library/API can use this base url scheme.
matchResult = patterns.default.match(spaceUrl);
if (matchResult)
return {
valid: matchResult.hasOwnProperty("spaceKey"),
result: {
...matchResult,
apiBase: generateAPIBaseUrl(matchResult),
},
};

// If using a custom subdomain Confluence URL pattern.
// We need to attach the customDomain as a property to the match result
// so we can form the correct REST API base from the subdomain.
matchResult = patterns.subdomain.match(spaceUrl);
if (matchResult) {
return {
valid: matchResult.hasOwnProperty("spaceKey"),
result: {
...matchResult,
apiBase: generateAPIBaseUrl(matchResult),
},
};
function validBaseUrl(baseUrl) {
try {
new URL(baseUrl);
return true;
} catch (e) {
return false;
}

// If using a base FQDN Confluence URL pattern.
// We need to attach the customDomain as a property to the match result
// so we can form the correct REST API base from the root domain since /display/ is basically a URL mask.
matchResult = patterns.custom.match(spaceUrl);
if (matchResult) {
return {
valid: matchResult.hasOwnProperty("spaceKey"),
result: {
...matchResult,
apiBase: generateAPIBaseUrl(matchResult, true),
},
};
}

// No match
return { valid: false, result: null };
}

/**
Expand All @@ -286,11 +225,12 @@ function validSpaceUrl(spaceUrl = "") {
* @returns {string}
*/
function generateChunkSource(
{ doc, baseUrl, accessToken, username },
{ doc, baseUrl, spaceKey, accessToken, username },
encryptionWorker
) {
const payload = {
baseUrl,
spaceKey,
token: accessToken,
username,
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ export default function ConfluenceOptions() {
}
);
const { data, error } = await System.dataConnectors.confluence.collect({
pageUrl: form.get("pageUrl"),
baseUrl: form.get("baseUrl"),
spaceKey: form.get("spaceKey"),
username: form.get("username"),
accessToken: form.get("accessToken"),
});
Expand Down Expand Up @@ -56,17 +57,37 @@ export default function ConfluenceOptions() {
<div className="flex flex-col pr-10">
<div className="flex flex-col gap-y-1 mb-4">
<label className="text-white text-sm font-bold flex gap-x-2 items-center">
<p className="font-bold text-white">Confluence Page URL</p>
<p className="font-bold text-white">Confluence base URL</p>
</label>
<p className="text-xs font-normal text-white/50">
URL of a page in the Confluence space.
This is the base URL of your Confluence space.
</p>
</div>
<input
type="url"
name="pageUrl"
name="baseUrl"
className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
placeholder="https://example.atlassian.net/wiki/spaces/~7120208c08555d52224113949698b933a3bb56/pages/851969/Test+anythingLLM+page"
placeholder="eg: https://example.atlassian.net, http://localhost:8211, etc..."
required={true}
autoComplete="off"
spellCheck={false}
/>
</div>
<div className="flex flex-col pr-10">
<div className="flex flex-col gap-y-1 mb-4">
<label className="text-white text-sm font-bold">
Confluence space key
</label>
<p className="text-xs font-normal text-white/50">
This is the spaces key of your confluence instance that will
be used. Usually begins with ~
</p>
</div>
<input
type="text"
name="spaceKey"
className="bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
placeholder="eg: ~7120208c08555d52224113949698b933a3bb56"
required={true}
autoComplete="off"
spellCheck={false}
Expand Down
5 changes: 3 additions & 2 deletions frontend/src/models/dataConnector.js
Original file line number Diff line number Diff line change
Expand Up @@ -119,12 +119,13 @@ const DataConnector = {
},

confluence: {
collect: async function ({ pageUrl, username, accessToken }) {
collect: async function ({ baseUrl, spaceKey, username, accessToken }) {
return await fetch(`${API_BASE}/ext/confluence`, {
method: "POST",
headers: baseHeaders(),
body: JSON.stringify({
pageUrl,
baseUrl,
spaceKey,
username,
accessToken,
}),
Expand Down