@@ -270,8 +270,11 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
270
270
} else if ( session_options . externalData !== undefined ) {
271
271
externalDataPromises = session_options . externalData . map ( async ( ext ) => {
272
272
// if the external data is a string, fetch the file and replace the string with its content
273
+ // @ts -expect-error TS2339
273
274
if ( typeof ext . data === "string" ) {
275
+ // @ts -expect-error TS2339
274
276
const ext_buffer = await getModelFile ( pretrained_model_name_or_path , ext . data , true , options ) ;
277
+ // @ts -expect-error TS2698
275
278
return { ...ext , data : ext_buffer } ;
276
279
}
277
280
return ext ;
@@ -1519,6 +1522,7 @@ export class PreTrainedModel extends Callable {
1519
1522
if ( this . config . model_type === 'musicgen' ) {
1520
1523
// Custom logic (TODO: move to Musicgen class)
1521
1524
decoder_input_ids = Array . from ( {
1525
+ // @ts -expect-error TS2339
1522
1526
length : batch_size * this . config . decoder . num_codebooks
1523
1527
} , ( ) => [ decoder_start_token_id ] ) ;
1524
1528
@@ -1848,11 +1852,13 @@ export class PreTrainedModel extends Callable {
1848
1852
async encode_image ( { pixel_values } ) {
1849
1853
// image_inputs === { pixel_values }
1850
1854
const features = ( await sessionRun ( this . sessions [ 'vision_encoder' ] , { pixel_values } ) ) . image_features ;
1855
+ // @ts -expect-error TS2339
1851
1856
if ( ! this . config . num_image_tokens ) {
1852
1857
console . warn (
1853
1858
'The number of image tokens was not set in the model configuration. ' +
1854
1859
`Setting it to the number of features detected by the vision encoder (${ features . dims [ 1 ] } ).`
1855
1860
)
1861
+ // @ts -expect-error TS2339
1856
1862
this . config . num_image_tokens = features . dims [ 1 ] ;
1857
1863
}
1858
1864
return features ;
@@ -3280,6 +3286,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
3280
3286
3281
3287
if ( generation_config . return_token_timestamps ) {
3282
3288
outputs [ "token_timestamps" ] = this . _extract_token_timestamps (
3289
+ // @ts -expect-error TS2345
3283
3290
outputs ,
3284
3291
generation_config . alignment_heads ,
3285
3292
generation_config . num_frames ,
@@ -3315,6 +3322,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
3315
3322
) ;
3316
3323
}
3317
3324
3325
+ // @ts -expect-error TS2339
3318
3326
let median_filter_width = this . config . median_filter_width ;
3319
3327
if ( median_filter_width === undefined ) {
3320
3328
console . warn ( "Model config has no `median_filter_width`, using default value of 7." )
@@ -3325,6 +3333,7 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
3325
3333
const batch = generate_outputs . cross_attentions ;
3326
3334
// Create a list with `decoder_layers` elements, each a tensor of shape
3327
3335
// (batch size, attention_heads, output length, input length).
3336
+ // @ts -expect-error TS2339
3328
3337
const cross_attentions = Array . from ( { length : this . config . decoder_layers } ,
3329
3338
// Concatenate the cross attentions for each layer across sequence length dimension.
3330
3339
( _ , i ) => cat ( batch . map ( x => x [ i ] ) , 2 )
@@ -3468,6 +3477,7 @@ export class LlavaForConditionalGeneration extends LlavaPreTrainedModel {
3468
3477
attention_mask,
3469
3478
} ) {
3470
3479
3480
+ // @ts -expect-error TS2339
3471
3481
const image_token_index = this . config . image_token_index ;
3472
3482
3473
3483
const idsList = input_ids . tolist ( ) ;
@@ -6201,10 +6211,12 @@ export class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel {
6201
6211
6202
6212
const { encoder_outputs, encoder_attention_mask } = await encoderForward ( this , model_inputs ) ;
6203
6213
6214
+ // @ts -expect-error TS2339
6204
6215
const r = encoder_outputs . dims [ 1 ] / this . config . reduction_factor ;
6205
6216
const maxlen = Math . floor ( r * maxlenratio ) ;
6206
6217
const minlen = Math . floor ( r * minlenratio ) ;
6207
6218
6219
+ // @ts -expect-error TS2339
6208
6220
const num_mel_bins = this . config . num_mel_bins ;
6209
6221
6210
6222
let spectrogramParts = [ ] ;
@@ -6569,11 +6581,13 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
6569
6581
*/
6570
6582
_apply_and_filter_by_delay_pattern_mask ( outputs ) {
6571
6583
const [ bs_x_codebooks , seqLength ] = outputs . dims ;
6584
+ // @ts -expect-error TS2339
6572
6585
const num_codebooks = this . config . decoder . num_codebooks ;
6573
6586
const upperBound = ( seqLength - num_codebooks ) ;
6574
6587
6575
6588
let newDataSize = 0 ;
6576
6589
for ( let i = 0 ; i < outputs . size ; ++ i ) {
6590
+ // @ts -expect-error TS2339
6577
6591
if ( outputs . data [ i ] === this . config . decoder . pad_token_id ) {
6578
6592
continue ;
6579
6593
}
@@ -6603,7 +6617,9 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
6603
6617
let clonedInputIds = structuredClone ( input_ids ) ;
6604
6618
for ( let i = 0 ; i < clonedInputIds . length ; ++ i ) {
6605
6619
for ( let j = 0 ; j < clonedInputIds [ i ] . length ; ++ j ) {
6620
+ // @ts -expect-error TS2339
6606
6621
if ( ( i % this . config . decoder . num_codebooks ) >= j ) {
6622
+ // @ts -expect-error TS2339
6607
6623
clonedInputIds [ i ] [ j ] = BigInt ( this . config . decoder . pad_token_id ) ;
6608
6624
}
6609
6625
}
@@ -6760,6 +6776,9 @@ export class MultiModalityCausalLM extends MultiModalityPreTrainedModel {
6760
6776
'past_key_values' ,
6761
6777
] ;
6762
6778
6779
+ /**
6780
+ * @param {ConstructorParameters<typeof MultiModalityPreTrainedModel> } args
6781
+ */
6763
6782
constructor ( ...args ) {
6764
6783
super ( ...args ) ;
6765
6784
0 commit comments