
[WIP] Add image-text-to-text pipeline #1347


Draft: wants to merge 1 commit into main
1 change: 1 addition & 0 deletions src/models.js
@@ -7842,6 +7842,7 @@ const MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = new Map([
    ['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]],
    ['smolvlm', ['SmolVLMForConditionalGeneration', SmolVLMForConditionalGeneration]],
    ['paligemma', ['PaliGemmaForConditionalGeneration', PaliGemmaForConditionalGeneration]],
    ['qwen2_vl', ['Qwen2VLForConditionalGeneration', Qwen2VLForConditionalGeneration]],
]);

const MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = new Map([
92 changes: 91 additions & 1 deletion src/pipelines.js
Expand Up @@ -42,6 +42,7 @@ import {
    AutoModelForDepthEstimation,
    AutoModelForImageFeatureExtraction,
    PreTrainedModel,
    AutoModelForImageTextToText,
} from './models.js';
import {
    AutoProcessor,
@@ -1990,6 +1991,85 @@ export class ImageToTextPipeline extends (/** @type {new (options: TextImagePipe
    }
}

/**
 * @callback ImageTextToTextPipelineCallback Generate text from the image(s) and text prompt(s) passed as inputs.
 * @param {ImagePipelineInputs} images The image(s) to condition the generation on.
 * TODO: support chat inputs (Chat|Chat[])
 * @param {string|string[]} texts The text prompt(s) to combine with the image(s). If a list of strings is passed, its length should match the number of images.
 * @param {Partial<TextGenerationConfig>} [options] Additional keyword arguments to pass along to the generate method of the model.
 * @returns {Promise<TextGenerationOutput|TextGenerationOutput[]>} An object (or array of objects) containing the generated text(s).
 *
 * @typedef {TextImagePipelineConstructorArgs & ImageTextToTextPipelineCallback & Disposable} ImageTextToTextPipelineType
 */

/**
 * Image Text To Text pipeline using an `AutoModelForImageTextToText`. This pipeline generates text from an image combined with a text prompt.
 *
 * **Example:** A minimal usage sketch (an assumption, not part of the PR: it uses the default `onnx-community/Qwen2-VL-2B-Instruct` model registered below, and the image URL and output are illustrative).
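 * ```javascript
 * import { pipeline } from '@huggingface/transformers';
 *
 * const generator = await pipeline('image-text-to-text', 'onnx-community/Qwen2-VL-2B-Instruct');
 * // Any image URL, path, or RawImage accepted by the pipeline works here.
 * const url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/cats.jpg';
 * const output = await generator(url, 'Describe this image.');
 * // [{ generated_text: '...' }] (actual text depends on the model)
 * ```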
*/
export class ImageTextToTextPipeline extends (/** @type {new (options: TextImagePipelineConstructorArgs) => ImageTextToTextPipelineType} */ (Pipeline)) {

    /**
     * Create a new ImageTextToTextPipeline.
     * @param {TextImagePipelineConstructorArgs} options An object used to instantiate the pipeline.
     */
    constructor(options) {
        super(options);
    }

    /** @type {ImageTextToTextPipelineCallback} */
    async _call(images, texts, generate_kwargs = {}) {
        const isBatchedImages = Array.isArray(images);
        const isBatchedTexts = Array.isArray(texts);
        const isBatched = isBatchedImages && isBatchedTexts;

        if (isBatchedImages !== isBatchedTexts) {
            throw Error("ImageTextToTextPipeline: If images are batched, texts must also be batched, and vice versa.");
        }

        if (isBatched && images.length !== texts.length) {
            throw Error("ImageTextToTextPipeline: If the images and texts are batched, they must have the same length.");
        }

        if (isBatched) {
            // TODO: support batches
            throw Error("ImageTextToTextPipeline: Batching is not supported yet.");
        }

        // if (isChat(texts) || isChat(images)) {
        //     // TODO: support chat
        //     throw Error("ImageTextToTextPipeline: Chat is not supported yet.");
        // }

        texts = Array.isArray(texts) ? texts : [texts];

        // Wrap each prompt in a minimal conversation (an image placeholder message followed by
        // the user's text) and let the processor's chat template render the final prompt string.
        const preparedTexts = texts.map((text) => {
            const conversation = [
                {
                    role: "user",
                    content: "<|image_pad|>",
                },
                {
                    role: "user",
                    content: text,
                },
            ];
            return this.processor.apply_chat_template(conversation, { add_generation_prompt: true, tokenize: false });
        });
        const preparedImages = await prepareImages(images);

        // The processor tokenizes the prompts and extracts the image features in a single call.
        // const image_inputs = await this.processor(preparedImages);
        // const text_inputs = await this.tokenizer(preparedTexts);
        const inputs = await this.processor(preparedTexts, preparedImages);

        // `max_new_tokens` is only a default here, so user-supplied generate_kwargs can override it.
        const outputs = await this.model.generate({ max_new_tokens: 128, ...inputs, ...generate_kwargs });

        const decoded = this.tokenizer.batch_decode(/** @type {Tensor} */(outputs), {
            skip_special_tokens: true,
        });

        return decoded.map(x => ({ generated_text: x.trim() }));
    }
}

/**
 * @typedef {Object} ImageClassificationSingle
 * @property {string} label The label identified by the model.
@@ -3203,7 +3283,16 @@ const SUPPORTED_TASKS = Object.freeze({
        },
        "type": "multimodal",
    },

"image-text-to-text": {
"tokenizer": AutoTokenizer,
"pipeline": ImageTextToTextPipeline,
"model": AutoModelForImageTextToText,
"processor": AutoProcessor,
"default": {
"model": "onnx-community/Qwen2-VL-2B-Instruct"
},
"type": "multimodal",
},
"image-classification": {
// no tokenizer
"pipeline": ImageClassificationPipeline,
@@ -3375,6 +3464,7 @@ const TASK_ALIASES = Object.freeze({
* - `"image-classification"`: will return a `ImageClassificationPipeline`.
* - `"image-segmentation"`: will return a `ImageSegmentationPipeline`.
* - `"image-to-text"`: will return a `ImageToTextPipeline`.
* - `"image-text-to-text"`: will return a `ImageTextToTextPipeline`.
* - `"object-detection"`: will return a `ObjectDetectionPipeline`.
* - `"question-answering"`: will return a `QuestionAnsweringPipeline`.
* - `"summarization"`: will return a `SummarizationPipeline`.
33 changes: 33 additions & 0 deletions tests/pipelines/test_pipelines_image_text_to_text.js
@@ -0,0 +1,33 @@
import { pipeline, ImageTextToTextPipeline } from "../../src/transformers.js";

import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../init.js";
import { load_cached_image } from "../asset_cache.js";

const PIPELINE_ID = "image-text-to-text";

export default () => {
  describe("Image Text to Text", () => {
    const model_id = "onnx-community/Qwen2-VL-2B-Instruct";
    // TODO: Looks like this model is too big and is triggering a timeout. Use a smaller model.

    /** @type {ImageTextToTextPipeline} */
    let pipe;
    let images;
    let texts;
    beforeAll(async () => {
      pipe = await pipeline(PIPELINE_ID, model_id, DEFAULT_MODEL_OPTIONS);
      images = await Promise.all([load_cached_image("white_image"), load_cached_image("blue_image")]);
      texts = ["What is the color of the image?", "What is the color of the image?"];
    }, MAX_MODEL_LOAD_TIME);

    it("should be an instance of ImageTextToTextPipeline", () => {
      expect(pipe).toBeInstanceOf(ImageTextToTextPipeline);
    });

    describe("batch_size=1", () => {
      it("default", async () => {
        const output = await pipe(images[0], texts[0]);
        const target = [{ generated_text: "" }]; // TODO: What should the expected output be? It will depend on the model...
        expect(output).toEqual(target);
      }, MAX_TEST_EXECUTION_TIME);
    });
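
    // Assumed addition (not in the original PR): dispose the pipeline after the suite,
    // following the repo's usual test pattern and the already-imported MAX_MODEL_DISPOSE_TIME budget.
    afterAll(async () => {
      await pipe?.dispose();
    }, MAX_MODEL_DISPOSE_TIME);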
  });
};