8 changes: 8 additions & 0 deletions .editorconfig
@@ -0,0 +1,8 @@
root = true

[*]
end_of_line = lf
insert_final_newline = true
charset = utf-8
indent_style = space
indent_size = 4
2 changes: 2 additions & 0 deletions .gitattributes
@@ -0,0 +1,2 @@
# SCM syntax highlighting & preventing 3-way merges
pixi.lock merge=binary linguist-language=YAML linguist-generated=true -diff
19 changes: 14 additions & 5 deletions .github/workflows/codeql.yml
@@ -17,7 +17,7 @@ on:
  pull_request:
    branches: [ "main" ]
  schedule:
    - cron: '39 1 * * 6'
    - cron: '35 4 * * 3'

jobs:
  analyze:
@@ -43,9 +43,11 @@ jobs:
      fail-fast: false
      matrix:
        include:
          - language: actions
            build-mode: none
          - language: python
            build-mode: none
        # CodeQL supports the following values for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift'
        # CodeQL supports the following values for 'language': 'actions', 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'rust', 'swift'
        # Use `c-cpp` to analyze code written in C, C++ or both
        # Use 'java-kotlin' to analyze code written in Java, Kotlin or both
        # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
@@ -57,9 +59,15 @@ jobs:
      - name: Checkout repository
        uses: actions/checkout@v4

      # Add any setup steps before running the `github/codeql-action/init` action.
      # This includes steps like installing compilers or runtimes (`actions/setup-node`
      # or others). This is typically only required for manual builds.
      # - name: Setup runtime (example)
      #   uses: actions/setup-example@v1

      # Initializes the CodeQL tools for scanning.
      - name: Initialize CodeQL
        uses: github/codeql-action/init@v3
        uses: github/codeql-action/init@v4
        with:
          languages: ${{ matrix.language }}
          build-mode: ${{ matrix.build-mode }}
@@ -76,7 +84,8 @@ jobs:
      # to build your code.
      # ℹ️ Command-line programs to run using the OS shell.
      # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
      - if: matrix.build-mode == 'manual'
      - name: Run manual build steps
        if: matrix.build-mode == 'manual'
        shell: bash
        run: |
          echo 'If you are using a "manual" build mode for one or more of the' \
@@ -87,6 +96,6 @@
          exit 1

      - name: Perform CodeQL Analysis
        uses: github/codeql-action/analyze@v3
        uses: github/codeql-action/analyze@v4
        with:
          category: "/language:${{matrix.language}}"
4 changes: 3 additions & 1 deletion .gitignore
@@ -104,4 +104,6 @@ target/
# runs
/runs/

```
# pixi environments
.pixi/*
!.pixi/config.toml
132 changes: 91 additions & 41 deletions README.md
@@ -52,13 +52,17 @@ capability for navigating the dynamic landscape of protein biology.
<!-- tocstop -->

## Installation
> **Note:** The [original conda environment](misc/environment.yml) and `setup.py` have dependency conflicts. This fork fixes them and unifies the setup with Pixi for a smoother installation. Please use the installation instructions below.

### Pixi Installation
See the official [Pixi installation guide](https://pixi.prefix.dev/latest/installation/).

### ProtNote Environment Activation
Simply `cd` into the git repo and run:
```sh
pixi shell
```
to activate the environment. When finished, use `exit` to deactivate it.

This replaces the original conda-based setup, which had the dependency conflicts noted above:
```sh
git clone https://github.com/microsoft/protnote.git
cd protnote
conda env create -f environment.yml
conda activate protnote
pip install -e ./ # make sure ./ is the dir including setup.py
```
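One-off commands can also be run in the Pixi environment without activating a shell. A minimal sketch (the `-e protnote` environment name is an assumption based on this fork's Pixi configuration):

```sh
# Run a single command inside the pixi-managed 'protnote' environment
pixi run -e protnote python bin/main.py --help
```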

## Config
Most hyperparameters and paths are managed through the `base_config.yaml`. Whenever reasonable, we enforce certain files to be in specific directories to increase consistency and reproducibility. In general, we adhere to the following data argument naming conventions in scripts:
@@ -77,7 +81,7 @@ We train and test ProtNote with protein sequences from the SwissProt section of

All the data needed to train and run inference with ProtNote is available in the data.zip file (17.6 GB), which can be downloaded from Zenodo using the following command *from the protnote root folder*:

```sh
sudo apt-get install unzip
curl -o data.zip "https://zenodo.org/records/13897920/files/data.zip?download=1"
unzip data.zip
@@ -119,8 +123,10 @@ This is a pickle file storing a pandas dataframe with the annotations and their

To seamlessly create the annotations file for GO annotations or EC numbers, we provide the `download_GO_annotations.py` and `download_EC_annotations.py` scripts. To get the GO annotations, run:

```sh
python bin/download_GO_annotations.py \
--url {GO_ANNOTATIONS_RELEASE_URL} \
--output-file {OUTPUT_FILE_NAME}
```

Where `{GO_ANNOTATIONS_RELEASE_URL}` is a specific GO release (e.g., https://release.geneontology.org/2024-06-17/ontology/go.obo) and `{OUTPUT_FILE_NAME}` is the name of the annotations file that will be stored in data/annotations/ (e.g., `go_annotations_jul_2024.pkl`).
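For instance, substituting the example values above (the same invocation appears in the reproducibility steps later in this README):

```sh
python bin/download_GO_annotations.py \
    --url https://release.geneontology.org/2024-06-17/ontology/go.obo \
    --output-file go_annotations_jul_2024.pkl
```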
@@ -136,8 +142,11 @@ For each sequence, ProtNote computes the likelihood that it is annotated with an

To generate the embeddings that we used to train ProtNote, execute the following code:

```sh
python bin/generate_label_embeddings.py \
--base-label-embedding-path {EMBEDDING_PATH_CONFIG_KEY} \
--annotations-path-name {ANNOTATIONS_PATH_CONFIG_KEY} \
--add-instruction --account-for-sos
```
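For example, one of the invocations from the reproducibility section below fills these placeholders in with the GO 2024 config keys:

```sh
python bin/generate_label_embeddings.py \
    --base-label-embedding-path GO_2024_BASE_LABEL_EMBEDDING_PATH \
    --annotations-path-name GO_ANNOTATIONS_PATH \
    --add-instruction --account-for-sos
```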

* `{EMBEDDING_PATH_CONFIG_KEY}`: should be a key from the config that specifies the "base" path name where the embeddings will be stored. It's called "base" because `{EMBEDDING_PATH_CONFIG_KEY}` will be modified based on some of the arguments passed to the script, such as the pooling method.
@@ -169,8 +178,16 @@ The test set is specified via the `--test-paths-names` argument, and the argumen
### Inference
To run inference, simply run:

```sh
python bin/main.py \
--test-paths-names {YOUR_TEST_SET_CONFIG_KEY} \
--model-file {MODEL_WEIGHTS_FILE} \
--name {MODEL_RUN_NAME} \
--base-label-embedding-name {EMBEDDING_PATH_CONFIG_KEY} \
--annotations-path-name {ANNOTATIONS_PATH_CONFIG_KEY} \
--save-prediction-results \
--save-val-test-metrics \
--save-val-test-metrics-file {OUT_METRICS_FILE}
```
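As an illustrative example, using a test set, model file, and config keys referenced elsewhere in this README (the run name and metrics file name are hypothetical placeholders):

```sh
# Hypothetical run: --name and the metrics file name are placeholders
python bin/main.py \
    --test-paths-names TEST_DATA_PATH \
    --model-file seed_replicates_v9_12_sum_last_epoch.pt \
    --name protnote_inference_run \
    --base-label-embedding-name GO_2024_BASE_LABEL_EMBEDDING_PATH \
    --annotations-path-name GO_ANNOTATIONS_PATH \
    --save-prediction-results \
    --save-val-test-metrics \
    --save-val-test-metrics-file test_metrics.json
```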

* `{YOUR_TEST_SET_CONFIG_KEY}`: the name of the test set path in the `base_config.yaml` (e.g., `TEST_DATA_PATH`).
@@ -203,13 +220,13 @@ The following two sections explain how to get the data to run the notebook succe

Download the data, ablation_models, and outputs folders. *Make sure you are in the root repo directory. The data, ablation_models, and outputs folders are 17.6, 41.4, and 18.7 GB in size, respectively*.

```sh
sudo apt-get install unzip # Install unzip if necessary
# Download files from zenodo
curl -o data.zip "https://zenodo.org/records/13897920/files/data.zip?download=1"
curl -o ablation_models.zip "https://zenodo.org/records/13897920/files/ablation_models.zip?download=1"
curl -o outputs.zip "https://zenodo.org/records/13897920/files/outputs.zip?download=1"
# Unzip files in the correct directories
unzip data.zip
unzip outputs.zip
unzip -j ablation_models.zip -d data/models/ProtNote/
@@ -227,27 +244,27 @@ Perform the following steps to download the original ProteInfer dataset TFRecor

Install gcloud:

```sh
sudo snap install google-cloud-cli --classic
```

Then, log in with a Google account (e.g., Gmail). The following command will open a browser window for authentication:

```sh
gcloud init
```

Download the data:

```sh
gsutil -m cp -r gs://brain-genomics-public/research/proteins/proteinfer/datasets/swissprot .
```

Move the `random` and `clustered` folders to the directory data/swissprot/proteinfer_splits/.
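For example (a minimal sketch, assuming the `gsutil` command above placed `swissprot/` in the current working directory):

```sh
# Create the target directory and move the downloaded splits into it
mkdir -p data/swissprot/proteinfer_splits/
mv swissprot/random swissprot/clustered data/swissprot/proteinfer_splits/
```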

To create the FASTA versions of these files, run the following commands from the repo root:

```sh
python bin/make_proteinfer_dataset.py --dataset-type random --annotation-types GO
python bin/make_proteinfer_dataset.py --dataset-type random --annotation-types EC
cp data/swissprot/proteinfer_splits/random/test_EC.fasta data/zero_shot/
@@ -256,10 +273,17 @@ cp data/swissprot/proteinfer_splits/random/test_EC.fasta data/zero_shot/
#### Download annotations
Download GO annotations and EC numbers.

```sh
python bin/download_GO_annotations.py \
--url https://release.geneontology.org/2019-07-01/ontology/go.obo \
--output-file go_annotations_2019_07_01.pkl
python bin/download_GO_annotations.py \
--url https://release.geneontology.org/2024-06-17/ontology/go.obo \
--output-file go_annotations_jul_2024.pkl
python bin/update_go_annotations.py \
--old-annotations-file-path data/annotations/go_annotations_2019_07_01.pkl \
--new-annotations-file-path data/annotations/go_annotations_jul_2024.pkl \
--output-file-path data/annotations/go_annotations_2019_07_01_updated.pkl
python bin/download_EC_annotations.py
```

@@ -273,15 +297,34 @@ Run `python bin/create_test_sets.py` to create all the remaining datasets us

We cached the text embeddings of annotation text descriptions under five scenarios: GO_2019 annotations with BioGPT, GO_2019 annotations with E5, EC numbers with BioGPT, EC numbers with E5, and GO_2024 annotations with E5. The following code generates the embeddings for these scenarios:

```sh
python bin/generate_label_embeddings.py \
--add-instruction --account-for-sos
python bin/generate_label_embeddings.py \
--label-encoder-checkpoint microsoft/biogpt \
--account-for-sos

python bin/generate_label_embeddings.py \
--base-label-embedding-path EC_BASE_LABEL_EMBEDDING_PATH \
--annotations-path-name EC_ANNOTATIONS_PATH \
--label-encoder-checkpoint microsoft/biogpt \
--account-for-sos
python bin/generate_label_embeddings.py \
--base-label-embedding-path EC_BASE_LABEL_EMBEDDING_PATH \
--annotations-path-name EC_ANNOTATIONS_PATH \
--add-instruction \
--account-for-sos

python bin/generate_label_embeddings.py \
--base-label-embedding-path GO_2024_BASE_LABEL_EMBEDDING_PATH \
--annotations-path-name GO_ANNOTATIONS_PATH \
--add-instruction \
--account-for-sos
python bin/generate_label_embeddings.py \
--base-label-embedding-path GO_2024_BASE_LABEL_EMBEDDING_PATH \
--annotations-path-name GO_ANNOTATIONS_PATH \
--label-encoder-checkpoint microsoft/biogpt \
--account-for-sos

```

@@ -296,7 +339,7 @@ Below are examples of two ProteInfer models. One is for GO annotation prediction
To download and get the predictions for the five ProteInfer seeds used in the paper, we need to clone ProteInfer's repo from `https://github.com/google-research/proteinfer.git` and create a ProteInfer conda environment. Make sure you are inside the `protnote` repo before running the following commands:


```sh
conda env create -f proteinfer_conda_requirements.yml
git clone https://github.com/google-research/proteinfer.git ../proteinfer
conda activate proteinfer
@@ -309,14 +352,15 @@ python bin/test_proteinfer.py --test-paths-names TEST_2024_PINF_VOCAB_DATA_PATH

To generate all the predictions shown in the Results notebook, you will need ProtNote's weights for the five different seeds we used, available on Zenodo (https://zenodo.org/records/13897920/files/data.zip, under data/models/ProtNote/). Place the downloaded models in protnote/data/models/ProtNote, or train the models directly. Then, run the following commands:

```sh
python bin/test_models.py --model-files \
seed_replicates_v9_12_sum_last_epoch.pt \
seed_replicates_v9_22_sum_last_epoch.pt \
seed_replicates_v9_32_sum_last_epoch.pt \
seed_replicates_v9_42_sum_last_epoch.pt \
seed_replicates_v9_52_sum_last_epoch.pt \
--test-paths-names "TEST_DATA_PATH_ZERO_SHOT_LEAF_NODES" "TEST_DATA_PATH_ZERO_SHOT" "TEST_EC_DATA_PATH_ZERO_SHOT" "TEST_DATA_PATH" --save-prediction-results
--test-paths-names "TEST_DATA_PATH_ZERO_SHOT_LEAF_NODES" "TEST_DATA_PATH_ZERO_SHOT" "TEST_EC_DATA_PATH_ZERO_SHOT" "TEST_DATA_PATH" \
--save-prediction-results

python bin/test_models.py --model-files \
seed_replicates_v9_42_sum_last_epoch.pt \
@@ -337,21 +381,25 @@ python bin/test_models.py --model-files \

To get BLAST-based predictions in the supervised setting, use the following code:

```sh
python bin/run_blast.py \
--test-data-path data/swissprot/proteinfer_splits/random/test_GO.fasta \
--train-data-path data/swissprot/proteinfer_splits/random/train_GO.fasta
```

The same command can be used to run BLAST-based inference on query subsets of different sizes. The runtime logged in the terminal was used in the runtime comparison figure in the paper's Supplementary Information. The file names of the query subsets follow the pattern `test_*_GO.fasta`, where `*` is the number of sequences. For example, to log the runtime for a single query sequence, run:

```sh
python bin/run_blast.py \
--test-data-path data/swissprot/proteinfer_splits/random/test_1_GO.fasta \
--train-data-path data/swissprot/proteinfer_splits/random/train_GO.fasta
```

#### Calculate supervised metrics

To calculate the supervised metrics for all models using the previously generated prediction files, run the following command:

```sh
python bin/calculate_supervised_metrics.py
```

@@ -362,8 +410,10 @@ This script calculates the mAP Macro and mAP Micro metrics for ProtNote and Prot
### Latest SwissProt data
To download the **latest** SwissProt file run:

```sh
python bin/download_swissprot.py \
--url "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz" \
--output-file "uniprot_sprot_latest.dat"
```

In general, we recommend downloading the SwissProt data directly from their [release archive](https://ftp.ebi.ac.uk/pub/databases/uniprot/previous_releases/) for reproducibility.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
43 changes: 43 additions & 0 deletions misc/pyproject.toml.legacy
@@ -0,0 +1,43 @@
[tool.ruff]
line-length = 140
ignore = ["F841", "F401", "E712"]
[tool.pixi.workspace]
name = "protnote_fork"
channels = ["conda-forge"]
platforms = ["linux-64"]

[tool.pixi.pypi-dependencies]
protnote_fork = { path = ".", editable = true }

[tool.pixi.tasks]

[tool.pixi.feature.protnote]
channels = ["biobuilds", "bioconda", "pytorch", "huggingface", "nvidia", "anaconda", "conda-forge", "main", "r", "msys2"]

[tool.pixi.feature.protnote.dependencies]
python = "3.10.13.*"
pytorch = "2.0.1.*"
torchvision = "0.15.2.*"
pytorch-cuda = "11.8.*"
pandas = "1.5.2.*"
joblib = "1.1.1.*"
transformers = "4.32.1.*"
torchmetrics = "1.2.0.*"
torchdata = "0.7.1.*"
wandb = "0.15.11.*"
sacremoses = "0.0.53.*"
pynvml = "11.5.0.*"
protobuf = "3.20.3.*"
scipy = "1.13.1.*"
seaborn = "0.13.2.*"
scikit-learn = "1.3.0.*"
matplotlib = "3.9.2.*"
umap-learn = "0.5.4.*"
blast = "2.12.0.*"
openpyxl = "3.1.5.*"

[tool.pixi.environments]
protnote = { features = ["protnote"], no-default-feature = true }

[dependency-groups]
protnote = ["pip", "torcheval==0.0.7", "wget==3.2", "azureml-mlflow==1.53.0", "loralib==0.1.2", "tensorboard==2.15.1", "obonet==1.0.0", "blosum==2.0.2", "biopython==1.84", "ipykernel==6.29.5"]
File renamed without changes.
File renamed without changes.
File renamed without changes.