JohnSnowLabs · maziyarpanahi · Mar 16, 2025 · Jan 6, 2025 · Jan 8, 2025 · Mar 16, 2025
diff --git a/.../python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBERTaForMultipleChoice.ipynb b/.../python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBERTaForMultipleChoice.ipynb
diff --git a/...transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_XLMRoBERTaForMultipleChoice.ipynb b/...transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_XLMRoBERTaForMultipleChoice.ipynb
diff --git a/python/sparknlp/annotator/classifier_dl/__init__.py b/python/sparknlp/annotator/classifier_dl/__init__.py
@@ -55,6 +55,7 @@
 from sparknlp.annotator.classifier_dl.albert_for_zero_shot_classification import *
 from sparknlp.annotator.classifier_dl.camembert_for_zero_shot_classification import *
 from sparknlp.annotator.classifier_dl.bert_for_multiple_choice import *
+from sparknlp.annotator.classifier_dl.xlm_roberta_for_multiple_choice import *
 from sparknlp.annotator.classifier_dl.roberta_for_multiple_choice import *
 from sparknlp.annotator.classifier_dl.distilbert_for_multiple_choice import *
 from sparknlp.annotator.classifier_dl.albert_for_multiple_choice import *
diff --git a/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py b/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_multiple_choice.py
@@ -0,0 +1,149 @@
+#  Copyright 2017-2022 John Snow Labs
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+from sparknlp.common import *
+
+
+class XlmRoBertaForMultipleChoice(AnnotatorModel,
+                                 HasCaseSensitiveProperties,
+                                 HasBatchedAnnotate,
+                                 HasEngine,
+                                 HasMaxSentenceLengthLimit):
+    """XlmRoBertaForMultipleChoice can load XLM-RoBERTa Models with a span classification head on top for extractive
+    question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute span start
+    logits and span end logits).
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> spanClassifier = XlmRoBertaForMultipleChoice.pretrained() \\
+    ...     .setInputCols(["document_question", "document_context"]) \\
+    ...     .setOutputCol("answer")
+
+    The default model is ``"xlm_roberta_base_qa_squad2"``, if no name is
+    provided.
+
+    For available pretrained models please see the `Models Hub
+    <https://sparknlp.org/models?task=Question+Answering>`__.
+
+    To see which models are compatible and how to import them see
+    `Import Transformers into Spark NLP 🚀
+    <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT, DOCUMENT``    ``CHUNK``
+    ====================== ======================
+
+    Parameters
+    ----------
+    batchSize
+        Batch size. Large values allows faster processing but requires more
+        memory, by default 8
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default
+        False
+    configProtoBytes
+        ConfigProto from tensorflow, serialized into byte array.
+    maxSentenceLength
+        Max sentence length to process, by default 128
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = MultiDocumentAssembler() \\
+    ...     .setInputCols(["question", "context"]) \\
+    ...     .setOutputCol(["document_question", "document_context"])
+    >>> spanClassifier = XlmRoBertaForMultipleChoice.pretrained() \\
+    ...     .setInputCols(["document_question", "document_context"]) \\
+    ...     .setOutputCol("answer") \\
+    ...     .setCaseSensitive(False)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     spanClassifier
+    ... ])
+    >>> data = spark.createDataFrame([["What's my name?", "My name is Clara and I live in Berkeley."]]).toDF("question", "context")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.select("answer.result").show(truncate=False)
+    +--------------------+
+    |result              |
+    +--------------------+
+    |[Clara]             |
+    +--------------------+
+    """
+    name = "XlmRoBertaForMultipleChoice"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.CHUNK
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.XlmRoBertaForMultipleChoice",
+                 java_model=None):
+        super(XlmRoBertaForMultipleChoice, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            batchSize=8,
+            maxSentenceLength=128,
+            caseSensitive=False
+        )
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        XlmRoBertaForMultipleChoice
+            The restored model
+        """
+        from sparknlp.internal import _XlmRoBertaMultipleChoiceLoader
+        jModel = _XlmRoBertaMultipleChoiceLoader(folder, spark_session._jsparkSession)._java_obj
+        return XlmRoBertaForMultipleChoice(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="xlm_roberta_base_mc", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default
+            "xlm_roberta_base_qa_squad2"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLPs repositories otherwise.
+
+        Returns
+        -------
+        XlmRoBertaForMultipleChoice
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(XlmRoBertaForMultipleChoice, name, lang, remote_loc)
diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py
@@ -561,6 +561,15 @@ def __init__(self, path, jspark):
         )
 
 
+class _XlmRoBertaMultipleChoiceLoader(ExtendedJavaWrapper):
+    def __init__(self, path, jspark):
+        super(_XlmRoBertaMultipleChoiceLoader, self).__init__(
+            "com.johnsnowlabs.nlp.annotators.classifier.dl.XlmRoBertaForMultipleChoice.loadSavedModel",
+            path,
+            jspark,
+        )
+
+
 class _XlnetLoader(ExtendedJavaWrapper):
     def __init__(self, path, jspark):
         super(_XlnetLoader, self).__init__(

diff --git a/python/test/annotator/classifier_dl/xlm_roberta_for_multiple_choice_test.py b/python/test/annotator/classifier_dl/xlm_roberta_for_multiple_choice_test.py
@@ -0,0 +1,76 @@
+#  Copyright 2017-2025 John Snow Labs
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import unittest
+
+import pytest
+
+from sparknlp.annotator import *
+from sparknlp.base import *
+from test.util import SparkContextForTest
+
+
+class XlmRoBertaForMultipleChoiceTestSetup(unittest.TestCase):
+    def setUp(self):
+        self.spark = SparkContextForTest.spark
+        self.question = "The Eiffel Tower is located in which country?"
+        self.choices = "Germany, France, Italy"
+
+        self.spark = SparkContextForTest.spark
+        empty_df = self.spark.createDataFrame([[""]]).toDF("text")
+
+        document_assembler = MultiDocumentAssembler() \
+            .setInputCols(["question", "context"]) \
+            .setOutputCols(["document_question", "document_context"])
+
+        bert_for_multiple_choice = XlmRoBertaForMultipleChoice.pretrained() \
+            .setInputCols(["document_question", "document_context"]) \
+            .setOutputCol("answer")
+
+        pipeline = Pipeline(stages=[document_assembler, bert_for_multiple_choice])
+
+        self.pipeline_model = pipeline.fit(empty_df)
+
+
+@pytest.mark.slow
+class XlmRoBertaForMultipleChoiceTest(XlmRoBertaForMultipleChoiceTestSetup, unittest.TestCase):
+
+    def setUp(self):
+        super().setUp()
+        self.data = self.spark.createDataFrame([[self.question, self.choices]]).toDF("question","context")
+        self.data.show(truncate=False)
+
+    def test_run(self):
+        result_df = self.pipeline_model.transform(self.data)
+        result_df.show(truncate=False)
+        for row in result_df.collect():
+            self.assertTrue(row["answer"][0].result != "")
+
+
+@pytest.mark.slow
+class LightXlmRoBertaForMultipleChoiceTest(XlmRoBertaForMultipleChoiceTestSetup, unittest.TestCase):
+
+    def setUp(self):
+        super().setUp()
+
+    def runTest(self):
+        light_pipeline = LightPipeline(self.pipeline_model)
+        annotations_result = light_pipeline.fullAnnotate(self.question,self.choices)
+        print(annotations_result)
+        for result in annotations_result:
+            self.assertTrue(result["answer"][0].result != "")
+
+        result = light_pipeline.annotate(self.question,self.choices)
+        print(result)
+        self.assertTrue(result["answer"] != "")
diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/XlmRoBertaClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/XlmRoBertaClassification.scala
@@ -469,6 +469,92 @@ private[johnsnowlabs] class XlmRoBertaClassification(
     (startScores, endScores)
   }
 
+  /** Runs multiple-choice inference on the detected engine (ONNX or OpenVINO) and softmaxes the logits. */
+  override def tagSpanMultipleChoice(batch: Seq[Array[Int]]): Array[Float] =
+    calculateSoftmax(detectedEngine match {
+      case ONNX.name => computeLogitsMultipleChoiceWithOnnx(batch)
+      case Openvino.name => computeLogitsMultipleChoiceWithOv(batch)
+      case other =>
+        throw new IllegalArgumentException(s"Multiple choice is not supported by engine: $other")
+    })
+
+  /** Computes the multiple-choice logits of one question with ONNX Runtime.
+    *
+    * @param batch
+    *   token ids of every choice, one array per choice; all arrays are fed as a single tensor of
+    *   shape (1, numChoices, sequenceLength)
+    * @return
+    *   the flattened content of the model's "logits" output
+    */
+  private def computeLogitsMultipleChoiceWithOnnx(batch: Seq[Array[Int]]): Array[Float] = {
+    // Wrapping in an outer Array adds the leading batch dimension of size one.
+    val inputIds = Array(batch.map(_.map(_.toLong)).toArray)
+    // Mask padding with sentencePadTokenId, the pad id the OpenVINO path uses, not a hard-coded 0.
+    val attentionMask = Array(
+      batch.map(_.map(id => if (id == sentencePadTokenId) 0L else 1L)).toArray)
+
+    val (ortSession, ortEnv) = onnxWrapper.get.getSession(onnxSessionOptions)
+    val tokenTensors = OnnxTensor.createTensor(ortEnv, inputIds)
+    val maskTensors = OnnxTensor.createTensor(ortEnv, attentionMask)
+
+    try {
+      val inputs = Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava
+      val output = ortSession.run(inputs)
+      try {
+        output
+          .get("logits")
+          .get()
+          .asInstanceOf[OnnxTensor]
+          .getFloatBuffer
+          .array()
+      } finally if (output != null) output.close()
+    } catch {
+      case e: Exception =>
+        logger.warn("Exception in computeLogitsMultipleChoiceWithOnnx", e)
+        // Rethrow so that the failure still reaches the caller.
+        throw e
+    } finally {
+      // Release the native input tensors on both the success and the failure path.
+      tokenTensors.close()
+      maskTensors.close()
+    }
+  }
+
+  /** Computes the multiple-choice logits of one question with OpenVINO.
+    *
+    * @param batch
+    *   token ids of every choice, one array per choice; assumed to share the length of the first
+    * @return
+    *   the data of the model's output tensor
+    */
+  private def computeLogitsMultipleChoiceWithOv(batch: Seq[Array[Int]]): Array[Float] = {
+    val numChoices = batch.length
+    val sequenceLength = batch.head.length
+    // batch_size, num_choices, sequence_length
+    val shape = Some(Array(1, numChoices, sequenceLength))
+    val (tokenTensors, maskTensors, _) =
+      PrepareEmbeddings.prepareOvLongBatchTensorsWithSegment(
+        batch,
+        sequenceLength,
+        numChoices,
+        sentencePadTokenId,
+        shape)
+
+    try {
+      // Everything that talks to OpenVINO runs inside the try so that failures get logged.
+      val inferRequest = openvinoWrapper.get.getCompiledModel().create_infer_request()
+      inferRequest.set_tensor("input_ids", tokenTensors)
+      inferRequest.set_tensor("attention_mask", maskTensors)
+      inferRequest.infer()
+
+      inferRequest.get_output_tensor().data()
+    } catch {
+      case e: Exception =>
+        logger.warn("Exception in computeLogitsMultipleChoiceWithOv", e)
+        // Rethrow so that the failure still reaches the caller.
+        throw e
+    }
+  }
+
   private def computeLogitsWithTF(
       batch: Seq[Array[Int]],
       maxSentenceLength: Int): (Array[Float], Array[Float]) = {