Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 18 additions & 5 deletions server/lib/builders/document-builder.js
Original file line number Diff line number Diff line change
Expand Up @@ -88,18 +88,31 @@ module.exports = exports = class DocumentBuilder {
removeNonSyndicatableImages() {

const embedsMap = arrayToMap(this.content.embeds);

Array.from(this.contentDocument.getElementsByTagName('img')).forEach(
(el) => {
const imageType = el.getAttribute('data-image-type');

let isFlourishElement = false;
// identify flourish element
const elementSrc = el.getAttribute('src');
if(elementSrc?.includes('public.flourish.studio/')) {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comment/suggestion: I'd consider a more functional approach, rather than an imperative style, when making changes to the DOM. We could simplify this to something like:

// Determine if it is a Flourish Element
const isFlourishElement = elementSrc?.includes('public.flourish.studio/');
// Determine image type
const imageType = isFlourishElement
? 'graphic'
: el.getAttribute('data-image-type');

This avoids multiple declarations and leaves less code to maintain.

isFlourishElement = true;
}

let imageType = el.getAttribute('data-image-type');
if(isFlourishElement) {
imageType = 'graphic';
}

let imageId =
el.getAttribute('data-id') ||
el.getAttribute('data-content-id');

// to handle ids in this format (https://api.ft.com/content/{content_id}})
imageId = imageId.split('/').pop();


if(isFlourishElement) {
const match = elementSrc.match(/\/visualisation\/(\d+)\//);
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comment: What is the value of elementSrc?

Instead of running a regex over the whole string, we can use the built-in URL parser and then apply a focused regex to the pathname only. Regexes are fragile when matched against full URLs, because they do not cover all the variations a URL can take:

// Parse the URL
const urlObj = new URL(elementSrc);

imageId = match ? match[1] : null;
}

const imageDetails = embedsMap[imageId];

if (imageType !== 'graphic' || !imageDetails || imageDetails.canBeSyndicated !== 'yes') {
Expand Down
37 changes: 37 additions & 0 deletions server/lib/enrich/article.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,34 @@ const {
const RE_BAD_CHARS = /[^A-Za-z0-9_]/gm;
const RE_SPACE = /\s/gm;

/**
 * Build one embed record for every `data-flourish-id="<digits>"` attribute
 * found in an article's HTML body.
 *
 * Each record points `apiUrl` and `binaryUrl` at the same Origami image
 * service URL, which wraps the visualisation's Flourish thumbnail URL.
 *
 * Review notes carried over from the pull request:
 * - The name is misspelled (it should read `extractFlourishEmbeds`). It is
 *   kept as-is here because the existing caller uses this spelling.
 * - `String.prototype.matchAll` is used instead of a `while (regex.exec())`
 *   loop, so no `lastIndex` state is carried on the global regex.
 *
 * @param {string} contentHTMLBody - Article body markup to scan.
 * @returns {Array<{apiUrl: string, binaryUrl: string, canBeSyndicated: string, id: string, type: string}>}
 *   One record per match, in document order; an empty array when there are
 *   no matches or when `contentHTMLBody` is not a string.
 */
function extractFourishEmbeds(contentHTMLBody) {
  // A non-string body has nothing to scan. Returning [] (rather than throwing
  // or returning null) matches what callers received for a missing body.
  if (typeof contentHTMLBody !== 'string') {
    return [];
  }

  // Regex to extract Flourish IDs
  const flourishIdRegex = /data-flourish-id="(\d+)"/g;

  return Array.from(contentHTMLBody.matchAll(flourishIdRegex), ([, id]) => {
    // The thumbnail URL is passed to the image service as a single encoded path segment.
    const flourishContentUrl = encodeURIComponent(`https://public.flourish.studio/visualisation/${id}/thumbnail?cacheBuster=`);
    const proxyUrl = `https://www.ft.com/__origami/service/image/v2/images/raw/${flourishContentUrl}?source=cp-content-pipeline&fit=scale-down&quality=highest&width=1020&dpr=1`;

    return {
      apiUrl: proxyUrl,
      binaryUrl: proxyUrl,
      canBeSyndicated: 'yes',
      id,
      type: 'http://www.ft.com/ontology/content/Graphic'
    };
  });
}

module.exports = exports = function article(content, contract, graphicSyndicationFlag) {
if (!content.content_id) {
content.content_id = path.basename(content.id);
Expand All @@ -28,6 +56,15 @@ module.exports = exports = function article(content, contract, graphicSyndicatio
content.bodyHTML = content.body;
}

const flourishEmbeds = extractFourishEmbeds(content.bodyHTML);
if(flourishEmbeds) {
content.embeds = content.embeds ?? [];
content.embeds.push(...flourishEmbeds);
if(content.contentStats){
content.contentStats.graphics += flourishEmbeds.length;
}
}

content.wordCount = getWordCount(content);
content.hasGraphics = Boolean(content.contentStats && content.contentStats.graphics);

Expand Down
Loading