machinelearning/src/Microsoft.ML.GenAI.LLaMA/LlamaForCausalLM.cs at 62ffa45310cf09ab682935cec27ae0dc36e22415 · dotnet/machinelearning · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Diagnostics;
using System.Text.Json;
using Microsoft.ML.GenAI.Core;
using Microsoft.ML.GenAI.Core.Extension;
using Microsoft.ML.GenAI.LLaMA.Module;
using TorchSharp;
using TorchSharp.Modules;
using TorchSharp.PyBridge;
using static TorchSharp.torch;

namespace Microsoft.ML.GenAI.LLaMA;

/// <summary>
/// LLaMA decoder stack (<see cref="LlamaModel"/>) followed by a bias-free linear language-model head
/// that projects the last hidden state to vocabulary logits.
/// </summary>
public class LlamaForCausalLM : nn.Module<CausalLMModelInput, CausalLMModelOutput>
{
    private readonly LlamaConfig _config;
    private readonly int _vocabSize;

#pragma warning disable MSML_PrivateFieldName // Private field name not in: _camelCase format
    private readonly Linear lm_head;
    private readonly LlamaModel model;
#pragma warning restore MSML_PrivateFieldName // Private field name not in: _camelCase format

    /// <summary>
    /// Builds the decoder stack and the language-model head from <paramref name="config"/>.
    /// </summary>
    /// <param name="config">Model hyper-parameters (hidden size, vocabulary size, dtype, weight tying, ...).</param>
    /// <param name="device">Forwarded to the <see cref="LlamaModel"/> constructor; the head itself is created without a device argument.</param>
    public LlamaForCausalLM(LlamaConfig config, string? device = null)
        : base(nameof(LlamaForCausalLM))
    {
        _config = config;
        _vocabSize = config.VocabSize;

        model = new LlamaModel(config, device);

        // The order of RegisterComponents() relative to the lm_head assignment is deliberate:
        // with tied word embeddings, components are registered BEFORE the head exists, so the head
        // gets its weights later from the embedding layer (see TieWordEmbeddings) instead of from
        // the checkpoint.
        // NOTE(review): this presumably keeps lm_head out of the registered sub-modules so the strict
        // checkpoint load does not expect a separate lm_head weight - confirm against RegisterComponents.
        if (config.TieWordEmbeddings)
        {
            this.RegisterComponents();
            lm_head = nn.Linear(config.HiddenSize, config.VocabSize, hasBias: false, dtype: config.DType);
        }
        else
        {
            lm_head = nn.Linear(config.HiddenSize, config.VocabSize, hasBias: false, dtype: config.DType);
            this.RegisterComponents();
        }
    }

    /// <summary>
    /// Loads the embedding layer's state dict into the language-model head and moves the head to
    /// the device the embedding weight lives on.
    /// </summary>
    private void TieWordEmbeddings()
    {
        // NOTE(review): load_state_dict presumably copies the values rather than aliasing the
        // embedding parameter - confirm if true weight sharing is required.
        var embeddingState = model.Embedding.state_dict();
        this.lm_head.load_state_dict(embeddingState);

        // Moved explicitly here, after the owning model has already been placed on its device.
        this.lm_head.to(device: model.Embedding.weight!.device);
    }

    /// <summary>
    /// Runs the decoder, projects the last hidden state to Float32 vocabulary logits and, when
    /// <c>input.Labels</c> is provided, also computes the cross-entropy loss.
    /// </summary>
    /// <param name="input">Model input; <c>Labels</c> is optional.</param>
    /// <returns>
    /// The decoder output with <c>Logits</c> set, and <c>Loss</c> set when labels were supplied.
    /// </returns>
    /// <remarks>
    /// No shift is applied between logits and labels: every logit position is scored against the
    /// label at the same position. Callers must therefore pass labels that are already aligned
    /// with the positions they are meant to supervise.
    /// </remarks>
#pragma warning disable MSML_GeneralName // This name should be PascalCased
    public override CausalLMModelOutput forward(CausalLMModelInput input)
#pragma warning restore MSML_GeneralName // This name should be PascalCased
    {
        var outputs = this.model.forward(input);

        // Upcast once; the same Float32 tensor is both returned to the caller and used for the loss.
        var logits = this.lm_head.forward(outputs.LastHiddenState).to_type(ScalarType.Float32);
        outputs.Logits = logits;

        // calculate the loss if the label is provided
        if (input.Labels is not null)
        {
            // Flatten to (N, vocab) / (N) for cross entropy. Every intermediate handle is scoped
            // with `using` so it is released as soon as the loss has been produced; `logits` itself
            // is NOT disposed because it is owned by `outputs`.
            using var contiguousLogits = logits.contiguous();
            using var flatLogits = contiguousLogits.view(-1, _vocabSize);
            using var contiguousLabels = input.Labels.contiguous();
            using var flatLabels = contiguousLabels.view(-1);

            // the loss is calculated by using the cross entropy loss by default
            // TODO: add support for other loss functions
            outputs.Loss = nn.functional.cross_entropy(flatLogits, flatLabels);
        }

        return outputs;
    }

    /// <summary>
    /// Creates a model from a folder containing a config file and safetensors checkpoint, loads
    /// the weights and moves the model to <paramref name="device"/>.
    /// </summary>
    /// <param name="modelFolder">Folder that contains the config and checkpoint files.</param>
    /// <param name="configName">Config file name, relative to <paramref name="modelFolder"/>.</param>
    /// <param name="checkPointName">Checkpoint (index) file name passed to <see cref="LoadSafeTensors"/>.</param>
    /// <param name="torchDtype">Data type written to the config before the model is constructed.</param>
    /// <param name="device">Device the loaded model is moved to.</param>
    /// <returns>The loaded model.</returns>
    /// <exception cref="ArgumentNullException">The config file deserializes to <c>null</c>.</exception>
    public static LlamaForCausalLM FromPretrained(
        string modelFolder,
        string configName = "config.json",
        string checkPointName = "model.safetensors.index.json",
        ScalarType torchDtype = ScalarType.BFloat16,
        string device = "cpu")
    {
        var modelConfig = LoadConfig(modelFolder, configName, torchDtype);
        var model = new LlamaForCausalLM(modelConfig);

        model.LoadSafeTensors(modelFolder, checkPointName);
        model = model.to(device);

        // Tie only after the move so the head follows the embedding's final device.
        if (modelConfig.TieWordEmbeddings)
        {
            model.TieWordEmbeddings();
        }

        return model;
    }

    /// <summary>
    /// Creates a model with optional quantization and/or a split of layers between
    /// <paramref name="targetDevice"/> and the CPU.
    /// </summary>
    /// <param name="modelFolder">Folder that contains the config and checkpoint files.</param>
    /// <param name="configName">Config file name, relative to <paramref name="modelFolder"/>.</param>
    /// <param name="checkPointName">Checkpoint (index) file name passed to <see cref="LoadSafeTensors"/>.</param>
    /// <param name="quantizeToInt8">Quantize to int8. Takes precedence over <paramref name="quantizeTo4Bit"/> when both are set.</param>
    /// <param name="quantizeTo4Bit">Quantize to 4 bit (ignored when <paramref name="quantizeToInt8"/> is set).</param>
    /// <param name="layersOnTargetDevice">Layer budget given to the device-map inference for <paramref name="targetDevice"/>; -1 together with no quantization selects the plain loading path.</param>
    /// <param name="torchDtype">Data type written to the config before the model is constructed.</param>
    /// <param name="targetDevice">Preferred device for the model's layers.</param>
    /// <returns>The loaded model.</returns>
    /// <exception cref="ArgumentNullException">The config file deserializes to <c>null</c>.</exception>
    public static LlamaForCausalLM FromPretrained(
        string modelFolder,
        string configName = "config.json",
        string checkPointName = "model.safetensors.index.json",
        bool quantizeToInt8 = false,
        bool quantizeTo4Bit = false,
        int layersOnTargetDevice = -1,
        ScalarType torchDtype = ScalarType.BFloat16,
        string targetDevice = "cuda")
    {
        // Nothing special requested: use the plain overload (selected by the ScalarType 4th argument).
        if (layersOnTargetDevice == -1 && quantizeTo4Bit == false && quantizeToInt8 == false)
        {
            return FromPretrained(modelFolder, configName, checkPointName, torchDtype, targetDevice);
        }

        var modelConfig = LoadConfig(modelFolder, configName, torchDtype);

        // The default device is process-wide state. It is changed twice below, so it must be
        // restored even when loading fails part-way through.
        var originalDefaultDevice = torch.get_default_device();
        try
        {
            // Pass 1: build (and quantize) a throw-away model on the "meta" device, used only to
            // infer which device each layer should be placed on.
            torch.set_default_device("meta");
            var metaModel = new LlamaForCausalLM(modelConfig);
            ApplyQuantization(metaModel, quantizeToInt8, quantizeTo4Bit);

            var deviceMap = metaModel.InferDeviceMapForEachLayer(
                [
                    KeyValuePair.Create(targetDevice, layersOnTargetDevice),
                    KeyValuePair.Create("cpu", -1)
                ]);

            // Pass 2: build the real model on the CPU, load the weights, then quantize and
            // distribute it according to the inferred map.
            torch.set_default_device("cpu");
            var model = new LlamaForCausalLM(modelConfig);

            model.LoadSafeTensors(modelFolder, checkPointName);
            ApplyQuantization(model, quantizeToInt8, quantizeTo4Bit);

            model = model.ToDynamicLoadingModel(deviceMap, targetDevice);

            if (modelConfig.TieWordEmbeddings)
            {
                model.TieWordEmbeddings();
            }

            return model;
        }
        finally
        {
            torch.set_default_device(originalDefaultDevice);
        }
    }

    /// <summary>
    /// Loads the checkpoint named <paramref name="checkPointName"/> from <paramref name="modelFolder"/>
    /// into this module, in strict mode and without a progress bar.
    /// </summary>
    /// <param name="modelFolder">Folder that contains the checkpoint.</param>
    /// <param name="checkPointName">Checkpoint (index) file name.</param>
    public void LoadSafeTensors(string modelFolder, string checkPointName = "model.safetensors.index.json")
    {
        this.load_checkpoint(path: modelFolder, checkpointName: checkPointName, strict: true, useTqdm: false);
    }

    /// <summary>
    /// Reads and deserializes the model config and stamps the requested dtype onto it.
    /// </summary>
    private static LlamaConfig LoadConfig(string modelFolder, string configName, ScalarType torchDtype)
    {
        var config = Path.Join(modelFolder, configName);
        var modelConfig = JsonSerializer.Deserialize<LlamaConfig>(File.ReadAllText(config)) ?? throw new ArgumentNullException(nameof(config));
        modelConfig.DType = torchDtype;

        return modelConfig;
    }

    /// <summary>
    /// Applies the requested quantization in place; int8 wins when both flags are set, and
    /// nothing happens when neither is set.
    /// </summary>
    private static void ApplyQuantization(LlamaForCausalLM model, bool quantizeToInt8, bool quantizeTo4Bit)
    {
        if (quantizeToInt8)
        {
            model.ToInt8QuantizeModule();
        }
        else if (quantizeTo4Bit)
        {
            model.ToQuantize4BitModule();
        }
    }
}