Skip to content

(EAI-1044) Add sourceType to all sources #756

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Jun 2, 2025
4 changes: 2 additions & 2 deletions packages/datasets/src/pageDataset/loadPageDataset.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ describe("loadPagesDataset", () => {
action: "deleted",
sourceType: "tech-docs",
},
// This page does not match sourceType (even though it is active)
// This page does not match sourceType = "tech-docs" (even though it is active)
{
url: "https://example.com/page4",
body: "Page 4 body",
Expand All @@ -56,7 +56,7 @@ describe("loadPagesDataset", () => {
updated: new Date(),
format: "html",
action: "created",
sourceType: "blog",
sourceType: "marketing",
},
{
url: "https://example.com/page5",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import {
ProjectBase,
removeMarkdownImagesAndLinks,
} from "mongodb-rag-core/dataSources";
import { SourceTypeName } from "./index";

export type DevCenterProjectConfig = ProjectBase & {
type: "devcenter";
Expand Down Expand Up @@ -47,7 +48,7 @@ export const makeDevCenterDataSource = async ({
const collection = db.collection<DevCenterEntry>(collectionName);
const documents = collection.find();

const pages: Page[] = [];
const pages: Page<SourceTypeName>[] = [];
for await (const document of documents) {
if (!document.content) {
logger.warn(
Expand All @@ -69,7 +70,7 @@ export function makeDevCenterPage(
document: DevCenterEntry,
name: string,
baseUrl: string
): Page {
): Page<SourceTypeName> {
assert(document.content, "document.content must be defined");
return {
title: document.name,
Expand Down
74 changes: 46 additions & 28 deletions packages/ingest-mongodb-public/src/sources/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,17 @@ export const devCenterProjectConfig: DevCenterProjectConfig = {
connectionUri: DEVCENTER_CONNECTION_URI,
};

/**
Predefined values for sourceType that we want to use in our Pages.

Supplied as the type argument to `Page`, `MakeMdOnGithubDataSourceParams`,
`makeMdOnGithubDataSource`, and `makeGitDataSource` by the data sources that
import it, so the `sourceType` values they set (for example
"university-content" and "tech-docs-external") are drawn from this union.
*/
export type SourceTypeName =
| "tech-docs"
| "devcenter"
| "marketing"
| "university-content"
| "tech-docs-external"
| "book-external";

const mongoDbUniversitySourceConstructor = async () => {
const universityDataApiKey = UNIVERSITY_DATA_API_KEY;
assert(!!universityDataApiKey, "UNIVERSITY_DATA_API_KEY required");
Expand All @@ -67,34 +78,37 @@ const mongoDbUniversitySourceConstructor = async () => {
return makeMongoDbUniversityDataSource(universityConfig);
};

export const mongoDbCorpDataSourceConfig: MakeMdOnGithubDataSourceParams = {
name: "mongodb-corp",
repoUrl: "https://github.com/mongodb/chatbot/",
repoLoaderOptions: {
branch: "main",
ignoreFiles: [/^(?!^\/mongodb-corp\/).*/, /^(mongodb-corp\/README\.md)$/],
},
pathToPageUrl(_, frontMatter) {
if (!frontMatter?.url) {
throw new Error("frontMatter.url must be specified");
}
return frontMatter?.url as string;
},
extractMetadata(_, frontMatter) {
if (!frontMatter) {
throw new Error("frontMatter must be specified");
}
const frontMatterCopy = { ...frontMatter };
delete frontMatterCopy.url;
return frontMatterCopy;
},
extractTitle: (_, frontmatter) => (frontmatter?.title as string) ?? null,
};
/**
Configuration for the "mongodb-corp" Markdown-on-GitHub data source, read
from the `main` branch of the mongodb/chatbot repository.
*/
export const mongoDbCorpDataSourceConfig: MakeMdOnGithubDataSourceParams<SourceTypeName> =
{
name: "mongodb-corp",
repoUrl: "https://github.com/mongodb/chatbot/",
repoLoaderOptions: {
branch: "main",
// First pattern matches every path that does NOT start with "/mongodb-corp/";
// second pattern matches exactly "mongodb-corp/README.md".
// Presumably paths matching any pattern are skipped by the loader.
// NOTE(review): the first pattern expects a leading slash and the second
// does not — confirm which path form the loader actually passes in.
ignoreFiles: [/^(?!^\/mongodb-corp\/).*/, /^(mongodb-corp\/README\.md)$/],
},
// The page URL comes from the front matter `url` field; throws when the
// front matter is missing or `url` is falsy.
pathToPageUrl(_, frontMatter) {
if (!frontMatter?.url) {
throw new Error("frontMatter.url must be specified");
}
return frontMatter?.url as string;
},
// Page metadata is a shallow copy of the front matter with `url` removed
// (the URL is already surfaced via pathToPageUrl). Throws when front matter
// is absent.
extractMetadata(_, frontMatter) {
if (!frontMatter) {
throw new Error("frontMatter must be specified");
}
const frontMatterCopy = { ...frontMatter };
delete frontMatterCopy.url;
return frontMatterCopy;
},
// Title is the front matter `title`, or null when it is null/undefined.
extractTitle: (_, frontmatter) => (frontmatter?.title as string) ?? null,
};
const mongoDbCorpDataSource = async () => {
return await makeMdOnGithubDataSource(mongoDbCorpDataSourceConfig);
return await makeMdOnGithubDataSource<SourceTypeName>(
mongoDbCorpDataSourceConfig
);
};

export const mongoDbUniMetadataDataSourceConfig: MakeMdOnGithubDataSourceParams =
export const mongoDbUniMetadataDataSourceConfig: MakeMdOnGithubDataSourceParams<SourceTypeName> =
{
name: "university-meta",
repoUrl: "https://github.com/mongodb/chatbot/",
Expand All @@ -117,24 +131,28 @@ export const mongoDbUniMetadataDataSourceConfig: MakeMdOnGithubDataSourceParams
return frontMatterCopy;
},
extractTitle: (_, frontmatter) => (frontmatter?.title as string) ?? null,
sourceType: "university-content",
metadata: {
siteTitle: "MongoDB University",
},
};
const mongoDbUniMetadataSource = async () => {
return await makeMdOnGithubDataSource(mongoDbUniMetadataDataSourceConfig);
return await makeMdOnGithubDataSource<SourceTypeName>(
mongoDbUniMetadataDataSourceConfig
);
};

export const terraformProviderSourceConstructor = async () => {
const siteBaseUrl =
"https://registry.terraform.io/providers/mongodb/mongodbatlas/latest/docs";
return await makeGitDataSource({
return await makeGitDataSource<SourceTypeName>({
name: "atlas-terraform-provider",
repoUri: "https://github.com/mongodb/terraform-provider-mongodbatlas.git",
repoOptions: {
"--depth": 1,
"--branch": "master",
},
sourceType: "tech-docs-external",
metadata: {
productName: "mongodbatlas Terraform Provider",
tags: ["docs", "terraform", "atlas", "hcl"],
Expand All @@ -147,7 +165,7 @@ export const terraformProviderSourceConstructor = async () => {
);
const url = getTerraformPageUrl(siteBaseUrl, path);

const page: Omit<Page, "sourceName"> = {
const page: Omit<Page<SourceTypeName>, "sourceName"> = {
body: removeMarkdownImagesAndLinks(body),
format: "md",
url: url,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import {
TiCatalogItem,
UniversityVideo,
} from "./MongoDbUniversityDataApiClient";
import { SourceTypeName } from "../index";

export const UNI_BASE_URL = "https://learn.mongodb.com";

Expand All @@ -20,7 +21,7 @@ export function makeUniversityPages({
tiCatalogItems: TiCatalogItem[];
videos: UniversityVideo[];
metadata?: PageMetadata;
}): Page[] {
}): Page<SourceTypeName>[] {
// Create a dictionary of videos keyed by their hashed ID.
// This is used to efficiently look up the video for a lesson.
const videoDict = makeVideosDictionary(videos);
Expand All @@ -44,8 +45,8 @@ function makeCatalogItemPages({
tiCatalogItems: TiCatalogItem[];
videoDict: VideosDict;
metadata?: PageMetadata;
}): Page[] {
const pages: Page[] = [];
}): Page<SourceTypeName>[] {
const pages: Page<SourceTypeName>[] = [];
for (const catalogItem of tiCatalogItems) {
/* Create page for higher level courses.
* Higher level courses are Learning Paths and Courses that have nested content.
Expand All @@ -56,14 +57,15 @@ function makeCatalogItemPages({
catalogItem.learning_format === "Learning Path" ||
catalogItem.learning_format === "Course"
) {
const page: Page = {
const page: Page<SourceTypeName> = {
sourceName,
url: `${UNI_BASE_URL}/learning-paths/${catalogItem.slug}`,
title: catalogItem.name,
format: "md",
body: generateContentDescriptionMarkdown({
tiCatalogItem: catalogItem,
}),
sourceType: "university-content",
metadata: {
...(metadata ?? {}),
tags: [...(metadata?.tags ?? []), "landing page"],
Expand All @@ -90,7 +92,7 @@ function makeCatalogItemPages({
if (body.length === 0) {
continue;
}
const page: Page = {
const page: Page<SourceTypeName> = {
sourceName,
url: makeUniversityPageUrl({
catalogItemSlug: catalogItem.slug,
Expand All @@ -104,6 +106,7 @@ function makeCatalogItemPages({
}),
format: "txt",
body,
sourceType: "university-content",
metadata: {
...(metadata ?? {}),
// We choose to not include tags returned by the API (i.e.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ interface WebSourceParams extends WebSource {
export function makeWebDataSource({
name,
urls,
sourceType,
staticMetadata,
makeBrowser,
}: WebSourceParams): DataSource {
Expand All @@ -37,6 +38,7 @@ export function makeWebDataSource({
url,
format: "md",
sourceName: name,
sourceType,
...content,
metadata: { ...content.metadata, ...staticMetadata },
});
Expand Down
Loading