Skip to content

Commit a9ab223

Browse files
committed
Merge branch 'compat_sparse_table_file' into 'master'
Compat sparse table file. See merge request deep-learning/tensornet!7
2 parents 7e432a1 + 9865c9c commit a9ab223

23 files changed

+712
-21
lines changed

.gitlab-ci.yml

+2-3
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,13 @@ tn_build:
1919
script:
2020
- sed -i "s|https://github.com|${NEXUS3_HEADER}/github.com|g" WORKSPACE
2121
- ./manager build
22-
- ./manager deploy
22+
- ./manager copy-libs
23+
- ./manager test
2324
cache:
2425
- key: cache-$CI_COMMIT_REF_NAME
2526
paths:
2627
- /root/.cache/bazel/_bazel_root/cache
2728
- /root/.cache/bazel/_bazel_root/install
2829
- /root/micromamba/pkgs
2930
when: manual
30-
#only:
31-
#- tags
3231

core/BUILD

-2
Original file line numberDiff line numberDiff line change
@@ -148,8 +148,6 @@ cc_binary(
148148
linkopts = [
149149
"-Wl,-rpath,$$ORIGIN/../libs",
150150
"-Wl,-rpath,$$ORIGIN/../../../../../lib",
151-
"-Wl,-rpath,$$ORIGIN/../../tensorflow",
152-
"-Wl,-rpath,$$ORIGIN/../../tensorflow/python"
153151
],
154152
linkshared = 1,
155153
)

core/ps/optimizer/ada_grad_kernel.cc

+13-1
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,8 @@ SparseAdaGradValue::SparseAdaGradValue(int dim, const AdaGrad* opt) {
9595
}
9696

9797
g2sum_ = opt->initial_g2sum;
98+
old_compat_ = false;
99+
no_show_days_ = 0;
98100
}
99101

100102
void SparseAdaGradValue::Apply(const AdaGrad* opt, SparseGradInfo& grad_info, int dim) {
@@ -117,33 +119,43 @@ void SparseAdaGradValue::Apply(const AdaGrad* opt, SparseGradInfo& grad_info, in
117119
}
118120

119121
void SparseAdaGradValue::SerializeTxt_(std::ostream& os, int dim) {
122+
os << dim << "\t";
120123
for (int i = 0; i < dim; i++) {
121124
os << Weight()[i] << "\t";
122125
}
123126

124127
os << g2sum_ << "\t";
125-
os << show_;
128+
os << show_ << "\t";
129+
os << no_show_days_;
126130
}
127131

128132
void SparseAdaGradValue::DeSerializeTxt_(std::istream& is, int dim) {
133+
is >> dim_;
129134
for (int i = 0; i < dim; i++) {
130135
is >> Weight()[i];
131136
}
132137

133138
is >> g2sum_;
134139
is >> show_;
140+
if(!old_compat_) {
141+
is >> no_show_days_;
142+
}
135143
}
136144

137145
void SparseAdaGradValue::SerializeBin_(std::ostream& os, int dim) {
138146
os.write(reinterpret_cast<const char*>(Weight()), dim * sizeof(float));
139147
os.write(reinterpret_cast<const char*>(&g2sum_), sizeof(g2sum_));
140148
os.write(reinterpret_cast<const char*>(&show_), sizeof(show_));
149+
os.write(reinterpret_cast<const char*>(&no_show_days_), sizeof(no_show_days_));
141150
}
142151

143152
void SparseAdaGradValue::DeSerializeBin_(std::istream& is, int dim) {
144153
is.read(reinterpret_cast<char*>(Weight()), dim * sizeof(float));
145154
is.read(reinterpret_cast<char*>(&g2sum_), sizeof(g2sum_));
146155
is.read(reinterpret_cast<char*>(&show_), sizeof(show_));
156+
if(!old_compat_) {
157+
is.read(reinterpret_cast<char*>(&no_show_days_), sizeof(no_show_days_));
158+
}
147159
}
148160

149161
void SparseAdaGradValue::ShowDecay(const AdaGrad* opt, int delta_days) {

core/ps/optimizer/ada_grad_kernel.h

+1
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ class alignas(4) SparseAdaGradValue
8585
virtual void DeSerializeBin_(std::istream& is, int dim);
8686

8787
private:
88+
int dim_;
8889
float g2sum_;
8990
float show_ = 0.0;
9091
int no_show_days_ = 0;

core/ps/optimizer/data_struct.cc

+1
Original file line numberDiff line numberDiff line change
@@ -42,5 +42,6 @@ void SparseOptValue::DeSerialize(std::istream& is, int dim) {
4242
}
4343
}
4444

45+
4546
} // namespace tensornet
4647

core/ps/optimizer/data_struct.h

+5-1
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ class alignas(4) SparseOptValue {
4040

4141
void DeSerialize(std::istream& is, int dim);
4242

43+
void SetOldCompat(bool old_compat) {
44+
old_compat_ = old_compat;
45+
}
46+
4347
float Show() const {
4448
return show_;
4549
}
@@ -53,7 +57,7 @@ class alignas(4) SparseOptValue {
5357
protected:
5458
float show_ = 0.0;
5559
int delta_show_ = 0;
56-
60+
bool old_compat_ = false;
5761
};
5862

5963
} // namespace tensornet {

core/ps/optimizer/optimizer.h

+29
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
#include <vector>
1919
#include <memory>
2020
#include <string>
21+
#include <iostream>
22+
#include <sstream>
2123

2224
namespace tensornet {
2325

@@ -36,6 +38,11 @@ class OptimizerBase {
3638

3739
virtual std::string Name() const = 0;
3840

41+
virtual std::tuple<bool, std::string> NeedOldCompat(std::istream& is, int dim) const {
42+
std::string emptyString = "";
43+
return std::make_tuple(false, emptyString);
44+
}
45+
3946
public:
4047
float learning_rate = 0.01;
4148
float show_decay_rate = 0.98;
@@ -70,6 +77,28 @@ class AdaGrad : public OptimizerBase {
7077
return "AdaGrad";
7178
}
7279

80+
std::tuple<bool, std::string> NeedOldCompat(std::istream& is, int dim) const {
81+
bool need_old_compat = false;
82+
std::string line;
83+
std::string cell;
84+
std::getline(is, line); // consume the trailing newline left on the stream
85+
std::getline(is, line);
86+
std::istringstream iss(line);
87+
int column_count = 0;
88+
89+
while (std::getline(iss, cell, '\t')) {
90+
++column_count;
91+
}
92+
93+
// columns should be: sign, dim_, dim_ * weight, g2sum, show, no_show_days
94+
// if column_count == dim + 4 (e.g. 12 when dim == 8), the no_show_days column is absent
95+
if(column_count == dim + 4){
96+
need_old_compat = true;
97+
}
98+
99+
return std::make_tuple(need_old_compat, line);
100+
}
101+
73102
public:
74103
float initial_g2sum = 0;
75104
float initial_scale = 1.0;

core/ps/optimizer/optimizer_kernel.h

+19-1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
#include <butil/iobuf.h>
2727
#include <butil/logging.h>
2828
#include <Eigen/Dense>
29+
#include <cstring>
30+
#include <cstdio>
2931

3032
#include <boost/iostreams/stream.hpp>
3133

@@ -389,10 +391,22 @@ class SparseKernelBlock {
389391
<< " you must make sure that use same optimizer when incremental training";
390392

391393
is.ignore(std::numeric_limits<std::streamsize>::max(), ':') >> block.dim_;
394+
395+
std::tuple<bool, std::string> tuple = block.opt_->NeedOldCompat(is, block.dim_);
396+
bool need_old_compat = std::get<0>(tuple);
397+
std::string sample_line = std::get<1>(tuple);
398+
std::istringstream sample_is(sample_line);
392399

393400
uint64_t sign = 0;
401+
while (sample_is >> sign) {
402+
ValueType* value = block.alloc_.allocate(block.dim_, block.opt_);
403+
value->SetOldCompat(need_old_compat);
404+
value->DeSerialize(sample_is, block.dim_);
405+
block.values_[sign] = value;
406+
}
394407
while (is >> sign) {
395408
ValueType* value = block.alloc_.allocate(block.dim_, block.opt_);
409+
value->SetOldCompat(need_old_compat);
396410
value->DeSerialize(is, block.dim_);
397411
block.values_[sign] = value;
398412
}
@@ -495,7 +509,11 @@ class SparseOptimizerKernel : public SparseOptimizerKernelBase {
495509
for (size_t i = 0; i < SPARSE_KERNEL_BLOCK_NUM; ++i) {
496510
threads.push_back(std::thread([this, i, &mode, &filepath]() {
497511
std::string file = filepath;
498-
file.append("/block_").append(std::to_string(i)).append(".gz");
512+
if(FileUtils::CheckFileExists(filepath + "/block_" + std::to_string(i) + ".gz")){
513+
file.append("/block_").append(std::to_string(i)).append(".gz");
514+
} else {
515+
file.append("/sparse_block_").append(std::to_string(i)).append(".gz");
516+
}
499517

500518
FileReaderSource reader_source(file, FCT_ZLIB);
501519
boost::iostreams::stream<FileReaderSource> in_stream(reader_source);

core/ps/table/sparse_table.cc

+5-1
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,11 @@ void SparseTable::Load(const std::string& filepath, const std::string& mode) {
116116
if (name_.empty()) {
117117
file += std::to_string(GetHandle());
118118
} else {
119-
file += name_;
119+
if(FileUtils::CheckFileExists(file + name_)){
120+
file += name_;
121+
} else {
122+
file += std::to_string(GetHandle());
123+
}
120124
}
121125

122126
file += "/rank_" + std::to_string(self_shard_id_);

core/utility/file_io.cc

+4-1
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,6 @@ FileReaderSource::FileReaderSource(const std::string& file,
139139
reader_ = std::make_shared<ReaderInternal>(reader.release(), compression_type);
140140
}
141141

142-
143142
FileReaderSource::~FileReaderSource() {
144143
reader_ = nullptr;
145144
}
@@ -162,5 +161,9 @@ std::streamsize FileReaderSource::read(char_type* str, std::streamsize n) {
162161
return buffer.size();
163162
}
164163

164+
bool FileUtils::CheckFileExists(const std::string& filepath) {
165+
return tensorflow::Env::Default()-> FileExists(filepath).ok();
166+
}
167+
165168
} // namespace tensornet
166169

core/utility/file_io.h

+6
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,12 @@ class FileReaderSource {
7070

7171
};
7272

73+
class FileUtils {
74+
public:
75+
static bool CheckFileExists(const std::string& filepath);
76+
77+
};
78+
7379
} // namespace tensornet
7480

7581
#endif // TENSORNET_UTILITY_SEMAPHORE_H_

manager

+12-1
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,14 @@ start_copy_libs(){
8282
}
8383

8484
start_test(){
85-
python -c "import tensorflow as tf;import tensornet as tn;tn.core.init()"
85+
[[ ${NEED_ACTIVATE_ENV} == true ]] && _activate_env
86+
export PYTHONPATH=${WORKSPACE_DIR}:${PYTHONPATH}
87+
MPI_LIB_PATH=$(ompi_info --parsable --path prefix 2>/dev/null | awk -F":" '{print $NF}')
88+
export LD_LIBRARY_PATH=${MPI_LIB_PATH}/lib:${LD_LIBRARY_PATH}
89+
cd examples
90+
rm -rf data model || true
91+
python gen_example_data.py
92+
python main.py
8693
}
8794

8895

@@ -139,6 +146,10 @@ case "$1" in
139146
shift 1
140147
start_create_dist "$@"
141148
;;
149+
(test)
150+
shift 1
151+
start_test "$@"
152+
;;
142153
(help)
143154
cmd=$(basename -- "$0")
144155
cat <<-END

tensornet/callbacks/callbacks.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ class PsWeightCheckpoint(Callback):
2424
"""
2525

2626
def __init__(self, checkpoint_dir, checkpoint_save=None, need_save_model=False, dt=None, delta_days=0, save_mode="txt",
27-
model_path_incl_dt=False):
27+
model_path_incl_dt=False, **kwargs):
2828
"""
2929
:param checkpoint_dir: path of save model
3030
:param need_save_model: whether save model
@@ -34,6 +34,7 @@ def __init__(self, checkpoint_dir, checkpoint_save=None, need_save_model=False,
3434
self.checkpoint_dir = checkpoint_dir
3535
self.checkpoint_save = checkpoint_save if checkpoint_save else checkpoint_dir
3636
self.need_save_model = need_save_model
37+
self.need_load_model = kwargs.get('need_load_model', True)
3738
self.save_mode = save_mode
3839
self.model_path_incl_dt = model_path_incl_dt
3940
self.dt = dt
@@ -43,7 +44,8 @@ def __init__(self, checkpoint_dir, checkpoint_save=None, need_save_model=False,
4344

4445
def load_model(self):
4546
tn.core.barrier()
46-
self.model.load_weights(self.checkpoint_dir, include_dt=self.model_path_incl_dt, mode=self.save_mode)
47+
if self.need_load_model:
48+
self.model.load_weights(self.checkpoint_dir, include_dt=self.model_path_incl_dt, mode=self.save_mode)
4749
tn.core.barrier()
4850

4951
def reset_balance_dataset(self):

tensornet/feature_column/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,5 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
from .category_column import *
15+
from .category_column import *
16+
from .sequence_category_column import *

0 commit comments

Comments
 (0)