Skip to content

Commit 1839e4a

Browse files
benizsileht
authored and committed
feat(stats): added service statistics mechanism
the initial version returns: inference_count, predict_success, predict_failure, predict_count, avg_batch_size, avg_predict_duration, avg_transform_duration
1 parent 4558ed8 commit 1839e4a

17 files changed

+276
-49
lines changed

src/CMakeLists.txt

+6-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,12 @@ if (USE_HDF5)
77
add_definitions(-DUSE_HDF5)
88
endif()
99

10-
set(ddetect_SOURCES deepdetect.h deepdetect.cc mllibstrategy.h mlmodel.h mlservice.h inputconnectorstrategy.h imginputfileconn.h csvinputfileconn.h csvinputfileconn.cc csvtsinputfileconn.h csvtsinputfileconn.cc svminputfileconn.h svminputfileconn.cc txtinputfileconn.h txtinputfileconn.cc apidata.h apidata.cc chain_actions.h chain_actions.cc chain.h chain.cc ext/rmustache/mustache.h ext/rmustache/mustache.cc)
10+
set(ddetect_SOURCES deepdetect.h deepdetect.cc mllibstrategy.h mlmodel.h
11+
mlservice.h inputconnectorstrategy.h imginputfileconn.h csvinputfileconn.h
12+
csvinputfileconn.cc csvtsinputfileconn.h csvtsinputfileconn.cc
13+
svminputfileconn.h svminputfileconn.cc txtinputfileconn.h
14+
txtinputfileconn.cc apidata.h apidata.cc chain_actions.h chain_actions.cc
15+
service_stats.h service_stats.cc chain.h chain.cc ext/rmustache/mustache.h ext/rmustache/mustache.cc)
1116
if (USE_JSON_API)
1217
list(APPEND ddetect_SOURCES jsonapi.h jsonapi.cc)
1318
endif()

src/backends/caffe/caffelib.cc

+7-8
Original file line numberDiff line numberDiff line change
@@ -2810,6 +2810,7 @@ namespace dd
28102810
cad.add("has_mean_file", has_mean_file);
28112811
if (ad_output.has("measure"))
28122812
{
2813+
// FIXME(sileht): Should we create service_stats here ?
28132814
try
28142815
{
28152816
inputc.transform(cad);
@@ -2842,15 +2843,13 @@ namespace dd
28422843
if (ad.has("chain") && ad.get("chain").get<bool>())
28432844
cad.add("chain", true);
28442845

2845-
try
2846-
{
2847-
inputc.transform(cad);
2848-
}
2849-
catch (std::exception &e)
2850-
{
2851-
throw;
2852-
}
2846+
this->_stats.transform_start();
2847+
inputc.transform(cad);
2848+
this->_stats.transform_end();
2849+
28532850
int batch_size = inputc.test_batch_size();
2851+
this->_stats.inc_inference_count(batch_size);
2852+
28542853
if (ad_mllib.has("net"))
28552854
{
28562855
APIData ad_net = ad_mllib.getobj("net");

src/backends/dlib/dliblib.cc

+5
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,8 @@ namespace dd
141141
TOutputConnectorStrategy tout(this->_outputc);
142142
APIData cad = ad;
143143
cad.add("model_repo", this->_mlmodel._repo);
144+
145+
this->_stats.transform_start();
144146
try
145147
{
146148
inputc.transform(cad);
@@ -149,9 +151,12 @@ namespace dd
149151
{
150152
throw;
151153
}
154+
this->_stats.transform_end();
152155

153156
APIData ad_mllib = ad.getobj("parameters").getobj("mllib");
154157
int batch_size = inputc.batch_size();
158+
this->_stats.inc_inference_count(batch_size);
159+
155160
if (ad_mllib.has("test_batch_size"))
156161
{
157162
batch_size = ad_mllib.get("test_batch_size").get<int>();

src/backends/ncnn/ncnnlib.cc

+5
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,8 @@ namespace dd
177177

178178
TInputConnectorStrategy inputc(this->_inputc);
179179
TOutputConnectorStrategy tout(this->_outputc);
180+
181+
this->_stats.transform_start();
180182
try
181183
{
182184
inputc.transform(ad);
@@ -185,6 +187,9 @@ namespace dd
185187
{
186188
throw;
187189
}
190+
this->_stats.transform_end();
191+
192+
this->_stats.inc_inference_count(inputc._ids.size());
188193

189194
// if height (timestep) changes we need to clear net before recreating an
190195
// extractor with new height, and to reload params and models after clear()

src/backends/tensorrt/tensorrtinputconns.cc

+1
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ namespace dd
8282
_imgs_size.insert(std::pair<std::string, std::pair<int, int>>(
8383
this->_ids.at(i), this->_images_size.at(i)));
8484
}
85+
_batch_size = this->_images.size();
8586
_batch_index = 0;
8687
}
8788

src/backends/tensorrt/tensorrtinputconns.h

+1
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ namespace dd
9090
std::string _meanfname = "mean.binaryproto";
9191
std::string _correspname = "corresp.txt";
9292
int _batch_index = 0;
93+
int _batch_size = 0;
9394
int process_batch(const unsigned int batch_size);
9495
std::unordered_map<std::string, std::pair<int, int>>
9596
_imgs_size; /**< image sizes, used in detection. */

src/backends/tensorrt/tensorrtlib.cc

+4
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,7 @@ namespace dd
464464
APIData cad = ad;
465465

466466
TOutputConnectorStrategy tout(this->_outputc);
467+
this->_stats.transform_start();
467468
try
468469
{
469470
inputc.transform(cad);
@@ -472,6 +473,9 @@ namespace dd
472473
{
473474
throw;
474475
}
476+
this->_stats.transform_end();
477+
478+
this->_stats.inc_inference_count(inputc._batch_size);
475479

476480
int idoffset = 0;
477481
std::vector<APIData> vrad;

src/backends/tf/tflib.cc

+4
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,8 @@ namespace dd
329329
TOutputConnectorStrategy tout;
330330
APIData cad = ad;
331331
cad.add("model_repo", this->_mlmodel._repo);
332+
333+
this->_stats.transform_start();
332334
try
333335
{
334336
inputc.transform(cad);
@@ -337,9 +339,11 @@ namespace dd
337339
{
338340
throw;
339341
}
342+
this->_stats.transform_end();
340343

341344
APIData ad_mllib = ad.getobj("parameters").getobj("mllib");
342345
int batch_size = inputc.batch_size();
346+
this->_stats.inc_inference_count(batch_size);
343347
if (ad_mllib.has("test_batch_size"))
344348
batch_size = ad_mllib.get("test_batch_size").get<int>();
345349

src/backends/torch/torchlib.cc

+7-2
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,8 @@
3737
#include "generators/net_caffe.h"
3838
#include "generators/net_caffe_recurrent.h"
3939

40-
#include <sys/types.h>
4140
#include <sys/stat.h>
41+
#include <sys/types.h>
4242
#include <fcntl.h>
4343

4444
#include "native/native.h"
@@ -1020,6 +1020,7 @@ namespace dd
10201020
if (_module._native != nullptr)
10211021
_module._native->update_input_connector(inputc);
10221022

1023+
this->_stats.transform_start();
10231024
TOutputConnectorStrategy outputc(this->_outputc);
10241025
try
10251026
{
@@ -1039,6 +1040,8 @@ namespace dd
10391040
{
10401041
throw;
10411042
}
1043+
this->_stats.transform_end();
1044+
10421045
torch::Device cpu("cpu");
10431046
_module.eval();
10441047

@@ -1074,6 +1077,8 @@ namespace dd
10741077
{
10751078
in_vals.push_back(tensor.to(_device));
10761079
}
1080+
this->_stats.inc_inference_count(in_vals.size());
1081+
10771082
Tensor output;
10781083
try
10791084
{
@@ -1336,4 +1341,4 @@ namespace dd
13361341
template class TorchLib<TxtTorchInputFileConn, SupervisedOutput, TorchModel>;
13371342
template class TorchLib<CSVTSTorchInputFileConn, SupervisedOutput,
13381343
TorchModel>;
1339-
}
1344+
} // namespace dd

src/backends/xgb/xgblib.cc

+4
Original file line numberDiff line numberDiff line change
@@ -452,6 +452,8 @@ namespace dd
452452
// data
453453
TInputConnectorStrategy inputc(this->_inputc);
454454
APIData cad = ad;
455+
456+
this->_stats.transform_start();
455457
try
456458
{
457459
inputc.transform(cad);
@@ -460,6 +462,7 @@ namespace dd
460462
{
461463
throw;
462464
}
465+
this->_stats.transform_end();
463466

464467
// load existing model as needed
465468
if (!_learner)
@@ -503,6 +506,7 @@ namespace dd
503506
// results
504507
// float loss = 0.0; // XXX: how to acquire loss ?
505508
int batch_size = preds.Size();
509+
this->_stats.inc_inference_count(batch_size);
506510
int nclasses = _nclasses;
507511
if (_objective == "multi:softprob")
508512
batch_size /= nclasses;

src/jsonapi.cc

-2
Original file line numberDiff line numberDiff line change
@@ -1575,8 +1575,6 @@ namespace dd
15751575
out.toJVal(jpred, jout);
15761576
JVal jhead(rapidjson::kObjectType);
15771577
jhead.AddMember("method", "/chain", jpred.GetAllocator());
1578-
// jhead.AddMember("service",d["service"],jpred.GetAllocator());
1579-
// if (!has_measure)
15801578
jhead.AddMember("time", jout["time"], jpred.GetAllocator());
15811579
jpred.AddMember("head", jhead, jpred.GetAllocator());
15821580
JVal jbody(rapidjson::kObjectType);

src/mllibstrategy.h

+9-10
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#define MLLIBSTRATEGY_H
2424

2525
#include "apidata.h"
26+
#include "service_stats.h"
2627
#include "utils/fileops.hpp"
2728
#include "dd_spdlog.h"
2829
#include <atomic>
@@ -94,7 +95,7 @@ namespace dd
9495
MLLib(MLLib &&mll) noexcept
9596
: _inputc(mll._inputc), _outputc(mll._outputc), _mltype(mll._mltype),
9697
_mlmodel(mll._mlmodel), _meas(mll._meas),
97-
_meas_per_iter(mll._meas_per_iter),
98+
_meas_per_iter(mll._meas_per_iter), _stats(mll._stats),
9899
_tjob_running(mll._tjob_running.load()), _logger(mll._logger)
99100
{
100101
}
@@ -298,8 +299,9 @@ namespace dd
298299
int c = 0;
299300
for (double l : vl)
300301
{
301-
std::string measl = meas + '_' + cnames.at(c); // std::to_string(c);
302-
auto hit = _meas.find(measl);
302+
std::string measl = meas + '_' + cnames.at(c);
303+
auto hit = _meas.find(
304+
measl); // not reusing add_meas since need a global lock
303305
if (hit != _meas.end())
304306
(*hit).second = l;
305307
else
@@ -309,7 +311,7 @@ namespace dd
309311
}
310312

311313
/**
312-
* \brief get currentvalue of argument measure
314+
* \brief get current value of argument measure
313315
* @param meas measure name
314316
* @return current value of measure
315317
*/
@@ -379,15 +381,12 @@ namespace dd
379381
std::unordered_map<std::string, std::vector<double>>
380382
_meas_per_iter; /**< model measures per iteration. */
381383

384+
ServiceStats _stats; /**< service statistics/metrics .*/
385+
382386
std::atomic<bool> _tjob_running = {
383387
false
384388
}; /**< whether a training job is running with this lib instance. */
385389

386-
bool _online
387-
= false; /**< whether the algorithm is online, i.e. it interleaves
388-
training and prediction calls. When not, prediction calls
389-
are rejected while training is running. */
390-
391390
std::shared_ptr<spdlog::logger> _logger; /**< mllib logger. */
392391

393392
long int _model_flops = 0; /**< model flops. */
@@ -398,7 +397,7 @@ namespace dd
398397
protected:
399398
mutable std::mutex
400399
_meas_per_iter_mutex; /**< mutex over measures history. */
401-
mutable std::mutex _meas_mutex; /** mutex around current measures. */
400+
mutable std::mutex _meas_mutex; /**< mutex around current measures. */
402401
const int _max_meas_points = 1e7; // 10M points max per measure
403402
};
404403

src/mlservice.h

+23-26
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ namespace dd
248248
ad.add("height", this->_inputc.height());
249249
}
250250
}
251+
this->_stats.to(ad);
251252
return ad;
252253
}
253254

@@ -283,7 +284,9 @@ namespace dd
283284
static_cast<long int>(this->_mem_used_train * sizeof(float)));
284285
stats.add("data_mem_test",
285286
static_cast<long int>(this->_mem_used_test * sizeof(float)));
286-
ad.add("stats", stats);
287+
ad.add("stats", stats); // FIXME(sileht): deprecated name, delete me when
288+
// platform use the new name
289+
ad.add("model_stats", stats);
287290
ad.add("jobs", vad);
288291
ad.add("parameters", _init_parameters);
289292
ad.add("repository", this->_inputc._model_repo);
@@ -292,6 +295,7 @@ namespace dd
292295
ad.add("type", std::string("unsupervised"));
293296
else
294297
ad.add("type", std::string("supervised"));
298+
this->_stats.to(ad);
295299
return ad;
296300
}
297301

@@ -495,36 +499,29 @@ namespace dd
495499
*/
496500
int predict_job(const APIData &ad, APIData &out, const bool &chain = false)
497501
{
498-
// TODO: collect input transformed data for chain, store it here in
499-
// memory
500-
// -> beware, the input connector is a copy...
502+
if (!_train_mutex.try_lock_shared())
503+
throw MLServiceLockException(
504+
"Predict call while training with an offline learning algorithm");
501505

502-
if (!this->_online)
506+
this->_stats.predict_start();
507+
508+
int err = 0;
509+
try
503510
{
504-
if (!_train_mutex.try_lock_shared())
505-
throw MLServiceLockException("Predict call while training with an "
506-
"offline learning algorithm");
507-
int err = 0;
508-
try
509-
{
510-
if (chain)
511-
const_cast<APIData &>(ad).add("chain", true);
512-
err = this->predict(ad, out);
513-
}
514-
catch (std::exception &e)
515-
{
516-
_train_mutex.unlock_shared();
517-
throw;
518-
}
519-
_train_mutex.unlock_shared();
520-
return err;
511+
if (chain)
512+
const_cast<APIData &>(ad).add("chain", true);
513+
err = this->predict(ad, out);
521514
}
522-
else // wait til a lock can be acquired
515+
catch (std::exception &e)
523516
{
524-
boost::shared_lock<boost::shared_mutex> lock(_train_mutex);
525-
return this->predict(ad, out);
517+
_train_mutex.unlock_shared();
518+
this->_stats.predict_end(false);
519+
throw;
526520
}
527-
return 0;
521+
this->_stats.predict_end(true);
522+
523+
_train_mutex.unlock_shared();
524+
return err;
528525
}
529526

530527
std::string _sname; /**< service name. */

0 commit comments

Comments
 (0)