Skip to content

Commit

Permalink
Merge branch 'main' into feat/DCP-1814-publish-package-to-gpr
Browse files Browse the repository at this point in the history
Signed-off-by: Vinayak Kulkarni <[email protected]>
  • Loading branch information
Vinayak Kulkarni committed Apr 29, 2024
2 parents 7e9289c + a53e371 commit e0f210a
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 62 deletions.
56 changes: 39 additions & 17 deletions __tests__/index.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import prefetchedResponse from './sampleResponse.json' assert { type: 'json' };

describe(`#getLinkPreview()`, () => {
it(`should extract link info from just URL`, async () => {
const linkInfo: any = await getLinkPreview(
const linkInfo = await getLinkPreview(
`https://www.youtube.com/watch?v=wuClZjOdT30`,
{ headers: { 'Accept-Language': `en-US` } },
);
Expand All @@ -14,13 +14,13 @@ describe(`#getLinkPreview()`, () => {
expect(linkInfo.title).toEqual(`Geography Now! Germany`);
expect(linkInfo.description).toBeTruthy();
expect(linkInfo.mediaType).toEqual(`video.other`);
expect(linkInfo.images.length).toEqual(1);
expect(linkInfo.images[0]).toEqual(
expect(linkInfo.images!.length).toEqual(1);
expect(linkInfo.images![0]).toEqual(
`https://i.ytimg.com/vi/wuClZjOdT30/maxresdefault.jpg`,
);
expect(linkInfo.videos.length).toEqual(0);
expect(linkInfo.favicons[0]).not.toBe(``);
expect(linkInfo.contentType.toLowerCase()).toEqual(`text/html`);
expect(linkInfo.videos!.length).toEqual(0);
expect(linkInfo.favicons![0]).not.toBe(``);
expect(linkInfo.contentType!.toLowerCase()).toEqual(`text/html`);
expect(linkInfo.charset?.toLowerCase()).toEqual(`utf-8`);
});

Expand Down Expand Up @@ -83,7 +83,7 @@ describe(`#getLinkPreview()`, () => {
);
expect(linkInfo.mediaType).toEqual(`audio`);
expect(linkInfo.contentType?.toLowerCase()).toEqual(`audio/mpeg`);
expect(linkInfo.favicons[0]).toBeTruthy();
expect(linkInfo.favicons![0]).toBeTruthy();
});

it(`should handle video urls`, async () => {
Expand All @@ -94,7 +94,7 @@ describe(`#getLinkPreview()`, () => {
expect(linkInfo.url).toEqual(`https://www.w3schools.com/html/mov_bbb.mp4`);
expect(linkInfo.mediaType).toEqual(`video`);
expect(linkInfo.contentType?.toLowerCase()).toEqual(`video/mp4`);
expect(linkInfo.favicons[0]).toBeTruthy();
expect(linkInfo.favicons![0]).toBeTruthy();
});

it(`should handle image urls`, async () => {
Expand All @@ -107,7 +107,7 @@ describe(`#getLinkPreview()`, () => {
);
expect(linkInfo.mediaType).toEqual(`image`);
expect(linkInfo.contentType?.toLowerCase()).toEqual(`image/jpeg`);
expect(linkInfo.favicons[0]).toBeTruthy();
expect(linkInfo.favicons![0]).toBeTruthy();
});

it(`should handle unknown content type urls`, async () => {
Expand All @@ -128,7 +128,7 @@ describe(`#getLinkPreview()`, () => {
);
expect(linkInfo.mediaType).toEqual(`application`);
expect(linkInfo.contentType?.toLowerCase()).toEqual(`application/pdf`);
expect(linkInfo.favicons[0]).toBeTruthy();
expect(linkInfo.favicons![0]).toBeTruthy();
});

it(`no link in text should fail gracefully`, async () => {
Expand Down Expand Up @@ -236,7 +236,7 @@ describe(`#getLinkPreview()`, () => {
});

it('should handle video tags without type or secure_url tags', async () => {
const res: any = await getLinkPreview(
const res = await getLinkPreview(
`https://newpathtitle.com/falling-markets-how-to-stop-buyer-from-getting-out/`,
{ followRedirects: `follow` },
);
Expand All @@ -247,19 +247,41 @@ describe(`#getLinkPreview()`, () => {
);
expect(res.description).toBeTruthy();
expect(res.mediaType).toEqual(`article`);
expect(res.images.length).toBeGreaterThan(0);
expect(res.videos.length).toBeGreaterThan(0);
expect(res.videos[0].url).toEqual(
expect(res.images!.length).toBeGreaterThan(0);
expect(res.videos!.length).toBeGreaterThan(0);
expect(res.videos![0].url).toEqual(
`https://www.youtube.com/embed/nqNXjxpAPkU`,
);
expect(res.favicons.length).toBeGreaterThan(0);
expect(res.contentType.toLowerCase()).toEqual(`text/html`);
expect(res.favicons!.length).toBeGreaterThan(0);
expect(res.contentType!.toLowerCase()).toEqual(`text/html`);
});

it('should auto detect mp4 even without a content type or file extension', async () => {
const res = await getLinkPreview(
'https://storage.googleapis.com/test-stubs/sample_mp4_without_extension',
);

expect(res.mediaType).toEqual(`video`);
expect(res.contentType).toEqual(`video/mp4`);
});

it('should throw exception if URL is not valid', async () => {
await expect(
getLinkPreview(
'https://storagenotvalid.googleapis.com/test-stubs/sample_mp4_without_extension',
),
).rejects.toThrowErrorMatchingSnapshot();
});
});

describe(`#getPreviewFromContent`, () => {
it(`Basic parsing`, async () => {
const linkInfo: any = await getPreviewFromContent(prefetchedResponse);
const linkInfo: any = await getPreviewFromContent({
...prefetchedResponse,
response: new Response(prefetchedResponse.data, {
headers: prefetchedResponse.headers,
}),
});

expect(linkInfo.url).toEqual(`https://www.youtube.com/watch?v=wuClZjOdT30`);
expect(linkInfo.siteName).toEqual(`YouTube`);
Expand Down
135 changes: 90 additions & 45 deletions index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,43 @@ import cheerio from 'cheerio';
import { fetch } from 'cross-fetch';
import AbortController from 'abort-controller';
import { CONSTANTS } from './constants';
import { fileTypeFromBuffer } from 'file-type';

type ILinkPreviewOptions = {
interface ILinkPreviewOptions {
headers?: Record<string, string>;
imagesPropertyType?: string;
proxyUrl?: string;
timeout?: number;
followRedirects?: `follow` | `error` | `manual`;
resolveDNSHost?: (url: string) => Promise<string>;
handleRedirects?: (baseURL: string, forwardedURL: string) => boolean;
};
}

type IPreFetchedResource = {
headers?: Record<string, string>;
interface IPreFetchedResource {
status?: number;
imagesPropertyType?: string;
proxyUrl?: string;
url: string;
data?: string;
response?: Response;
response: Response;
}

/**
 * Shape of the preview object produced by `parseTextResponse`,
 * `parseResponse` and `getLinkPreview`.
 *
 * Only `url`, `mediaType` and `contentType` are always present as keys;
 * everything else is filled in when the corresponding data is available.
 */
export type LinkPreview = {
  /** Address of the resource the preview describes. */
  url: string;
  /** Title of the resource, when one is available. */
  title?: string;
  /** Name of the site hosting the resource (e.g. `YouTube`). */
  siteName?: string | undefined;
  /** Description of the resource, when one is available. */
  description?: string | undefined;
  /** Kind of media, e.g. `video`, `audio`, `image`, `article`, `application`. */
  mediaType: string;
  /** MIME type of the resource, e.g. `text/html`; `undefined` when unknown. */
  contentType: string | undefined;
  /** Image URLs associated with the resource. */
  images?: Array<string>;
  /**
   * Videos associated with the resource.
   * NOTE(review): `secureUrl` / `type` presumably mirror the page's video
   * `secure_url` / `type` meta tags — confirm against the video extraction code.
   */
  videos?: Array<{
    url: string | undefined;
    secureUrl: string | null | undefined;
    type: string | null | undefined;
    width: string | undefined;
    height: string | undefined;
  }>;
  /** Favicons associated with the resource. */
  favicons?: Array<URL>;
  /** Character set of the resource, e.g. `utf-8`. */
  charset?: string;
};

const throwOnLoopback = (address: string) => {
Expand Down Expand Up @@ -63,7 +81,11 @@ const getDescription = (doc: cheerio.Root) => {
return description;
};

const getMediaType = (doc: cheerio.Root) => {
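/**
* Determine the media type of a parsed document. Checks the `medium` meta tag
* first, then falls back to the `og:type` meta tag (looked up by `property`,
* then by `name`).
*
* @param doc Cheerio root of the parsed document.
*/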
function getMediaType(doc: cheerio.Root) {
const node = metaTag(doc, `medium`, `name`);
if (node) {
const content = node.attr(`content`);
Expand All @@ -73,7 +95,7 @@ const getMediaType = (doc: cheerio.Root) => {
metaTagContent(doc, `og:type`, `property`) ||
metaTagContent(doc, `og:type`, `name`)
);
};
}

const getImages = (
doc: cheerio.Root,
Expand Down Expand Up @@ -274,7 +296,7 @@ const parseTextResponse = (
url: string,
options: ILinkPreviewOptions = {},
contentType?: string,
) => {
): LinkPreview => {
const doc = cheerio.load(body);

return {
Expand All @@ -290,48 +312,67 @@ const parseTextResponse = (
};
};

// TODO: can use file-type package to determine mime type based on magic numbers
const parseUnknownResponse = (
body: string,
url: string,
options: ILinkPreviewOptions = {},
contentType?: string,
) => {
return parseTextResponse(body, url, options, contentType);
};
// Read up to the first SAMPLE_SIZE bytes of the response body for file-type
// detection. Falls back to reading the whole body when it cannot be streamed.
const readBytesForFileType = async (response: Response) => {
// We get this from the file-type package as the sample size
const SAMPLE_SIZE = 4100;

const getData = async (response: IPreFetchedResource) => {
if (response.data) {
return response.data;
// If the body has no stream reader, get the array buffer directly from the response
if (!response.body || !response.body.getReader) {
return await response.arrayBuffer();
}

if (response.response) {
return await response.response.text();
const reader = response.body.getReader();

// Use the streaming API to collect the first SAMPLE_SIZE bytes
// of the response
const buffer = new Uint8Array(SAMPLE_SIZE);
let offset = 0;
let chunk;
while (!(chunk = await reader.read()).done) {
if (chunk.value.length + offset > SAMPLE_SIZE) {
const subChunk = chunk.value.subarray(0, SAMPLE_SIZE - offset);
buffer.set(subChunk, offset);
offset = SAMPLE_SIZE;
break;
} else {
buffer.set(chunk.value, offset);
offset += chunk.value.length;
}
}

throw new Error(`link-preview-js could not fetch link information`);
return buffer.subarray(0, offset);
};

const parseResponse = async (
response: IPreFetchedResource,
options?: ILinkPreviewOptions,
) => {
): Promise<LinkPreview> => {
if (!response.response.ok) {
throw new Error(
`link-preview-js unexpected status in response ${response.response.status} ${response.response.statusText}`,
);
}

try {
// console.log("[link-preview-js] response", response);
let contentType = response.response
? response.response.headers.get(`content-type`)
: response.headers
? response.headers[`content-type`]
: null;
let contentType = response.response.headers.get(`content-type`);
let contentTypeTokens: string[] = [];
let charset = null;

if (!contentType) {
return parseUnknownResponse(
await getData(response),
response.url,
options,
);
let charset;

// If the content type is sufficiently vague, then use the file type package to
// determine the content type via magic numbers.
if (
!contentType ||
['application/octet-stream', 'video', 'audio'].includes(contentType)
) {
const buffer = await readBytesForFileType(response.response);
const fileType = await fileTypeFromBuffer(buffer);
if (!fileType) {
const text = new TextDecoder().decode(buffer);
return parseTextResponse(text, response.url, options);
} else {
contentType = fileType.mime;
}
}

if (contentType.includes(`;`)) {
Expand Down Expand Up @@ -361,7 +402,7 @@ const parseResponse = async (
if (CONSTANTS.REGEX_CONTENT_TYPE_TEXT.test(contentType)) {
return {
...parseTextResponse(
await getData(response),
await response.response.text(),
response.url,
options,
contentType,
Expand All @@ -378,7 +419,11 @@ const parseResponse = async (
}

return {
...parseUnknownResponse(await getData(response), response.url, options),
...(await parseTextResponse(
await response.response.text(),
response.url,
options,
)),
charset,
};
} catch (e) {
Expand All @@ -390,13 +435,13 @@ const parseResponse = async (
}
};

// Parses the text, extracts the first link it finds and does a HTTP request
// to fetch the website content, afterwards it tries to parse the internal HTML
// and extract the information via meta tags
// Parses the text, extracts the first link it finds and does a HTTP request
// to fetch the website content, afterwards it tries to parse the internal HTML
// and extract the information via meta tags
export const getLinkPreview = async (
text: string,
options?: ILinkPreviewOptions,
) => {
): Promise<LinkPreview> => {
if (!text || typeof text !== `string`) {
throw new Error(`link-preview-js did not receive a valid url or text`);
}
Expand Down

0 comments on commit e0f210a

Please sign in to comment.