Skip to content

Commit 4b7cfe6

Browse files
authored
Merge pull request #98 from IITH-Compilers/inMemoryVocabulary
created file to generate vocabulary as map
2 parents 3e2f3af + c8dc365 commit 4b7cfe6

File tree

18 files changed

+124
-129
lines changed

18 files changed

+124
-129
lines changed

Manylinux2014_Compliant_Source/pkg/build.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,7 @@ cd ..
1818
cmake -DCMAKE_BUILD_TYPE=Release .. && make -j"$(nproc)" && make install
1919

2020
cd ..
21+
cp build/vocabulary.h Manylinux2014_Compliant_Source/pkg/ir2vec/
2122
cp src/include/utils.h Manylinux2014_Compliant_Source/pkg/ir2vec/
2223
cp build/src/version.h Manylinux2014_Compliant_Source/pkg/ir2vec/
23-
cp vocabulary/seedEmbeddingVocab.txt Manylinux2014_Compliant_Source/pkg/ir2vec/
24-
2524
bash Manylinux2014_Compliant_Source/pkg/regen-oracle.sh

Manylinux2014_Compliant_Source/pkg/ir2vec/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,3 @@
1414
__version__ = getVersion()
1515
__copyright__ = "Copyright The Contributors of IR2Vec"
1616
__license__ = "BSD 4-Clause License"
17-
18-
setSeedEmbdPath(preparation.install_loc_pkg)

Manylinux2014_Compliant_Source/pkg/ir2vec/core.cpp

Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
#include "IR2Vec.h"
1010
#include "utils.h"
1111
#include "version.h"
12-
1312
#include <Python.h>
1413
#include <cstring>
1514
#include <fstream>
@@ -58,14 +57,6 @@ static PyObject *getIR2VecVersion(PyObject *self, PyObject *args) {
5857
NULL);
5958
}
6059

61-
PyObject *setSeedEmbeddingPath(PyObject *self, PyObject *args) {
62-
const char *vocab_path2 = "";
63-
if (PyArg_ParseTuple(args, "s", &vocab_path2)) {
64-
seed_emb_path = string(vocab_path2);
65-
}
66-
return PyUnicode_FromString("Seed Embedding Path is Set");
67-
}
68-
6960
bool fileNotValid(const char *filename) {
7061
ifstream temp;
7162
temp.open(filename, ios_base::in);
@@ -169,21 +160,18 @@ class IR2VecHandler {
169160
// The scope of this Module object is extremely crucial
170161
std::unique_ptr<llvm::Module> Module;
171162
Module = IR2Vec::getLLVMIR();
172-
std::string vocab_path = seed_emb_path + "/seedEmbeddingVocab.txt";
173163

174164
IR2Vec::Embeddings *emb = new IR2Vec::Embeddings();
175165
// if output file is provided
176166
if (this->outputFile != "") {
177167
string outFile = this->outputFile;
178168
ofstream output;
179169
output.open(outFile, ios_base::app);
180-
emb = std::move(new IR2Vec::Embeddings(*Module, ir2vecMode, vocab_path,
181-
(this->level)[0], &output,
182-
funcName));
170+
emb = std::move(new IR2Vec::Embeddings(
171+
*Module, ir2vecMode, (this->level)[0], &output, funcName));
183172
} else {
184-
emb = std::move(new IR2Vec::Embeddings(*Module, ir2vecMode, vocab_path,
185-
(this->level)[0], nullptr,
186-
funcName));
173+
emb = std::move(new IR2Vec::Embeddings(
174+
*Module, ir2vecMode, (this->level)[0], nullptr, funcName));
187175
}
188176

189177
if (!emb) {
@@ -381,8 +369,6 @@ PyMethodDef IR2Vec_core_Methods[] = {
381369
"Get Program Vector"},
382370
{"getFunctionVectors", (PyCFunction)getFunctionVectors, METH_VARARGS,
383371
"Get Function Vectors"},
384-
{"setSeedEmbdPath", (PyCFunction)setSeedEmbeddingPath, METH_VARARGS,
385-
"Set Seed Embedding Path"},
386372
{"getVersion", getIR2VecVersion, METH_VARARGS, "Get IR2Vec Version"},
387373
{NULL, NULL, 0, NULL} /* Sentinel */
388374
};

Manylinux2014_Compliant_Source/pkg/regen-oracle.sh

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,12 @@ mkdir -p ${DEST_FOLDER_SYM_P}
2828
mkdir -p ${DEST_FOLDER_FA_P}
2929

3030
IR2VEC_PATH=../../build/bin/ir2vec
31-
VOCAB_PATH="../../vocabulary/seedEmbeddingVocab.txt"
3231

3332
while IFS= read -r d; do
3433
echo "Generating embeddings for ${d}"
35-
${IR2VEC_PATH} -sym -vocab=${VOCAB_PATH} -o ${DEST_FOLDER_SYM}/ir2vec.txt -level f ${d} &>/dev/null
36-
${IR2VEC_PATH} -fa -vocab=${VOCAB_PATH} -o ${DEST_FOLDER_FA}/ir2vec.txt -level f ${d} &>/dev/null
37-
${IR2VEC_PATH} -sym -vocab=${VOCAB_PATH} -o ${DEST_FOLDER_SYM_P}/ir2vec.txt -level p ${d} >/dev/null
38-
${IR2VEC_PATH} -fa -vocab=${VOCAB_PATH} -o ${DEST_FOLDER_FA_P}/ir2vec.txt -level p ${d} >/dev/null
34+
${IR2VEC_PATH} -sym -o ${DEST_FOLDER_SYM}/ir2vec.txt -level f ${d} &>/dev/null
35+
${IR2VEC_PATH} -fa -o ${DEST_FOLDER_FA}/ir2vec.txt -level f ${d} &>/dev/null
36+
${IR2VEC_PATH} -sym -o ${DEST_FOLDER_SYM_P}/ir2vec.txt -level p ${d} >/dev/null
37+
${IR2VEC_PATH} -fa -o ${DEST_FOLDER_FA_P}/ir2vec.txt -level p ${d} >/dev/null
3938
done <index-${SEED_VERSION}.files
4039
wait

Manylinux2014_Compliant_Source/pkg/setup.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,5 @@ def get_llvm_files():
8181
],
8282
ext_modules=[IR2Vec_core],
8383
packages=["ir2vec"],
84-
package_data={"": ["seedEmbeddingVocab.txt"]},
8584
include_package_data=True,
8685
)

README.md

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -110,14 +110,13 @@ To ensure the correctness, run `make verify-all`
110110
instructions.
111111

112112
### Using Binary
113-
> ir2vec -\<mode\> -vocab \<seedEmbedding-file-path\> -o \<output-file\> -level \<p|f\> -class \<class-number\> -funcName=\<function-name\> \<input-ll-file\>
113+
> ir2vec -\<mode\> -o \<output-file\> -level \<p|f\> -class \<class-number\> -funcName=\<function-name\> \<input-ll-file\>
114114
115115
#### Command-Line options
116116

117117
- `mode` - can be one of `sym`/`fa`
118118
- `sym` denotes Symbolic representation
119119
- `fa` denotes Flow-Aware representation
120-
- `vocab` - the path to the seed embeddings file
121120
- `o` - file in which the embeddings are to be appended; (Note : If file doesn’t exist, new file would be created, else embeddings would be appended)
122121
- `level` - can be one of chars `p`/`f`.
123122
- `p` denotes `program level` encoding
@@ -139,16 +138,16 @@ Please use `--help` for further details.
139138
140139
#### Flow-Aware Embeddings
141140
For all functions
142-
* `` ir2vec -fa -vocab vocabulary/seedEmbeddingVocab.txt -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``
141+
* `` ir2vec -fa -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``
143142

144143
For a specific function
145-
* `` ir2vec -fa -vocab vocabulary/seedEmbeddingVocab.txt -o <output_file> -level f -class <class-number> -funcName=\<function-name\><input_ll_file>``
144+
* `` ir2vec -fa -o <output_file> -level f -class <class-number> -funcName=\<function-name\> <input_ll_file>``
146145

147146
#### Symbolic Embeddings
148147
For all functions
149-
* `` ir2vec -sym -vocab vocabulary/seedEmbeddingVocab.txt -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``
148+
* `` ir2vec -sym -o <output_file> -level <p|f> -class <class-number> <input_ll_file>``
150149
For a specific function
151-
* `` ir2vec -sym -vocab vocabulary/seedEmbeddingVocab.txt -o <output_file> -level f -class <class-number> -funcName=\<function-name\> <input_ll_file>``
150+
* `` ir2vec -sym -o <output_file> -level f -class <class-number> -funcName=\<function-name\> <input_ll_file>``
152151

153152
## Using Libraries
154153
The libraries can be installed by passing the installation location to the `CMAKE_INSTALL_PREFIX` flag during `cmake` followed by `make install`.
@@ -176,8 +175,7 @@ The following example snippet shows how to query the exposed vector representati
176175

177176
// Creating object to generate FlowAware representation
178177
auto ir2vec =
179-
IR2Vec::Embeddings(<LLVM Module>, IR2Vec::IR2VecMode::FlowAware,
180-
"./vocabulary/seedEmbeddingVocab.txt");
178+
IR2Vec::Embeddings(<LLVM Module>, IR2Vec::IR2VecMode::FlowAware);
181179

182180
// Getting Instruction vectors corresponding to the instructions in <LLVM Module>
183181
auto instVecMap = ir2vec.getInstVecMap();

experiments/generate_IR2Vec_embeddings.sh

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,6 @@ fi
1919

2020
BUILD=$(realpath ${BUILD})
2121

22-
Absolute_path_of_RepresentationFile=$(realpath ../vocabulary/seedEmbeddingVocab.txt)
23-
2422
TASK_DIR=$1
2523
if [ -z "${TASK_DIR}" ]; then
2624
echo "Task is not mentioned. Please enter value of DM for Device Mapping or TC for Thread_Coarsening."
@@ -88,7 +86,7 @@ ulimit -s unlimited
8886
for d in ./*.ll; do
8987
let "a++"
9088
echo "$a $d" >>${ALL_FILE}
91-
${BUILD}/bin/ir2vec -${PASS} -vocab $Absolute_path_of_RepresentationFile -class $a -o res_$Trans_type.txt -level p $WEIGHTS $d &>/dev/null
89+
${BUILD}/bin/ir2vec -${PASS} -class $a -o res_$Trans_type.txt -level p $WEIGHTS $d &>/dev/null
9290
done
9391
cd ../..
9492

src/CMakeLists.txt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,16 @@
11

22
configure_file (./include/version.h.cmake version.h @ONLY)
33
include_directories(./include ${CMAKE_CURRENT_BINARY_DIR})
4-
4+
include_directories(${CMAKE_BINARY_DIR})
55
set(commonsrc FlowAware.cpp Symbolic.cpp utils.cpp)
66
set(libsrc libIR2Vec.cpp ${commonsrc})
77
set(binsrc CollectIR.cpp IR2Vec.cpp)
8-
9-
file(GLOB RESOURCE_FILES ../vocabulary/seedEmbeddingVocab.txt)
10-
118
option(LLVM_IR2VEC "where to enable IR2Vec as subproject for LLVM" OFF)
12-
9+
# Generate ${CMAKE_BINARY_DIR}/vocabulary.h from the seed embedding vocabulary
# at configure time.
#
# Only ONE command is given on purpose: execute_process() runs consecutive
# COMMANDs as a pipeline (stdout of each is piped into the next), so a trailing
# `COMMAND echo ...` would swallow the generator's output and print its message
# even when the generator failed.
execute_process(
  COMMAND python3 generate_vocabulary.py -o ${CMAKE_BINARY_DIR}/vocabulary.h
  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
  RESULT_VARIABLE VOCABULARY_GENERATION_RESULT
)
# RESULT_VARIABLE holds the exit code, or an error string if the process could
# not be started (e.g. python3 is missing); both cases compare unequal to 0.
if(NOT VOCABULARY_GENERATION_RESULT EQUAL 0)
  message(FATAL_ERROR
    "Failed to generate ${CMAKE_BINARY_DIR}/vocabulary.h "
    "(generate_vocabulary.py returned: ${VOCABULARY_GENERATION_RESULT})")
endif()
message(STATUS "Vocabulary file generated.")
1314
if(NOT LLVM_IR2VEC)
1415

1516
set(LT_LLVM_INSTALL_DIR "" CACHE PATH "LLVM installation directory")
@@ -37,7 +38,6 @@ if(NOT LLVM_IR2VEC)
3738
VERSION ${PROJECT_VERSION}
3839
SOVERSION 1
3940
PUBLIC_HEADER "./include/IR2Vec.h"
40-
RESOURCE ${RESOURCE_FILES}
4141
OUTPUT_NAME ${IR2VEC_LIB}
4242
LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
4343
ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib

src/IR2Vec.cpp

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,6 @@ cl::opt<bool> cl_collectIR(
3232
"collectIR", cl::Optional,
3333
cl::desc("Generate triplets for training seed embedding vocabulary"),
3434
cl::init(false), cl::cat(category));
35-
36-
cl::opt<std::string> cl_vocab("vocab", cl::Optional, cl::init(""),
37-
cl::desc("Use embeddings from file path"),
38-
cl::cat(category));
39-
4035
cl::opt<std::string> cl_iname(cl::Positional, cl::desc("Input file path"),
4136
cl::Required, cl::cat(category));
4237

@@ -81,7 +76,6 @@ int main(int argc, char **argv) {
8176
fa = cl_fa;
8277
sym = cl_sym;
8378
collectIR = cl_collectIR;
84-
vocab = cl_vocab;
8579
iname = cl_iname;
8680
oname = cl_oname;
8781
// newly added
@@ -105,18 +99,12 @@ int main(int argc, char **argv) {
10599
errs() << "Invalid level specified: Use either p or f\n";
106100
failed = true;
107101
}
108-
if (vocab.empty()) {
109-
errs() << "Should specify vocab pointing to the path of vocabulary\n";
110-
failed = true;
111-
}
112102
} else {
113103
if (!collectIR) {
114104
errs() << "Either of sym, fa or collectIR should be specified\n";
115105
failed = true;
116106
} else if (level)
117107
errs() << "[WARNING] level would not be used in collectIR mode\n";
118-
else if (!vocab.empty())
119-
errs() << "[WARNING] vocab would not be used in collectIR mode\n";
120108
}
121109

122110
if (failed)

src/generate_vocabulary.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# Copyright (c) 2024, The Contributors of IR2Vec.
2+
#
3+
# Part of the IR2Vec project. This software is available under the BSD 4-Clause
4+
# License. Please see LICENSE file in the top-level directory for more details.
5+
#
6+
import argparse
import os
import sys

# Default seed embedding vocabulary. Resolved relative to this script (not the
# current working directory) so the generator works no matter where it is
# invoked from.
vocab_file = os.path.normpath(
    os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "..",
        "vocabulary",
        "seedEmbeddingVocab.txt",
    )
)

# Define headers and opening/closing of map
header = """\
// Generated by IR2Vec. DO NOT EDIT!
// This file contains the learned vocabulary used by IR2Vec.
//
// clang-format off

#ifndef __VOCABULARY__
#define __VOCABULARY__

#include <map>
#include <string>
#include <vector>
#include "IR2Vec.h"

namespace IR2Vec {

class Vocabulary {
public:
static const std::map<std::string, IR2Vec::Vector>& getVocabulary() {
return vocabulary;
}
private:
static const std::map<std::string, IR2Vec::Vector> vocabulary;
};

"""

opening = "\nconst std::map<std::string, IR2Vec::Vector> Vocabulary::vocabulary = {\n"
closing = """\
};
} // namespace IR2Vec

#endif // __VOCABULARY__
"""


def parse_vocabulary(lines):
    """Parse vocabulary lines of the form ``key:[v1, v2, ...]``.

    Args:
        lines: Iterable of text lines (e.g. an open file object).

    Returns:
        A list of ``(key, values)`` tuples in input order, where ``values`` is
        the text found between the first ``[`` and the following ``]``.

    Raises:
        ValueError: If a non-blank line has no key, no ``:`` separator, or no
            bracketed value list.
    """
    entries = []
    for lineno, raw in enumerate(lines, start=1):
        line = raw.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        key, sep, val = line.partition(":")
        key = key.strip()
        start = val.find("[")
        end = val.find("]", start + 1)
        if not sep or not key or start == -1 or end == -1:
            raise ValueError(
                f"Malformed vocabulary entry on line {lineno}: {line!r}"
            )
        entries.append((key, val[start + 1 : end]))
    return entries


def write_vocabulary_header(entries, output_path):
    """Write the C++ vocabulary header for ``entries`` to ``output_path``.

    Args:
        entries: List of ``(key, values)`` tuples as returned by
            ``parse_vocabulary``.
        output_path: Path of the header file to create or overwrite.

    Raises:
        OSError: If ``output_path`` cannot be opened for writing.
    """
    with open(output_path, "w") as fw:
        fw.write(header)
        # Write vector declarations to the output file
        for key, values in entries:
            fw.write(f"const IR2Vec::Vector {key}_vector = {{ {values} }};\n")

        fw.write(opening)
        # Write map entries to the output file
        for key, _ in entries:
            fw.write(f' {{ "{key}", {key}_vector }},\n')
        fw.write(closing)


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        0 on success, 1 on any error, so callers such as the build system can
        detect a failed generation from the exit status.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-o", "--output", type=str, help="Output file name")
    parser.add_argument(
        "--vocab",
        type=str,
        default=vocab_file,
        help="Seed embedding vocabulary file (default: %(default)s)",
    )
    args = parser.parse_args(argv)

    if args.output is None:
        print("Error: Output file path not provided.", file=sys.stderr)
        return 1

    output_file = args.output

    # Read and validate the whole vocabulary BEFORE touching the output file,
    # so a failure never leaves a truncated header behind.
    try:
        with open(args.vocab, "r") as fr:
            entries = parse_vocabulary(fr)
    except FileNotFoundError:
        print(f"Error: Vocabulary file '{args.vocab}' not found.", file=sys.stderr)
        return 1
    except ValueError as err:
        print(f"Error: {err}", file=sys.stderr)
        return 1

    try:
        write_vocabulary_header(entries, output_file)
    except OSError as err:
        print(f"Error: Could not write '{output_file}': {err}", file=sys.stderr)
        return 1

    print(f"Generated {output_file}")
    return 0


if __name__ == "__main__":
    sys.exit(main())

0 commit comments

Comments
 (0)