Skip to content

Commit 1839e4a

Browse files
benizsileht
authored and committed
feat(stats): added service statistics mechanism
the initial version returns: inference_count, predict_success, predict_failure, predict_count, avg_batch_size, avg_predict_duration, avg_transform_duration
1 parent 4558ed8 commit 1839e4a

17 files changed

+276
-49
lines changed

src/CMakeLists.txt

+6-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,12 @@ if (USE_HDF5)
77
add_definitions(-DUSE_HDF5)
88
endif()
99

10-
set(ddetect_SOURCES deepdetect.h deepdetect.cc mllibstrategy.h mlmodel.h mlservice.h inputconnectorstrategy.h imginputfileconn.h csvinputfileconn.h csvinputfileconn.cc csvtsinputfileconn.h csvtsinputfileconn.cc svminputfileconn.h svminputfileconn.cc txtinputfileconn.h txtinputfileconn.cc apidata.h apidata.cc chain_actions.h chain_actions.cc chain.h chain.cc ext/rmustache/mustache.h ext/rmustache/mustache.cc)
10+
set(ddetect_SOURCES deepdetect.h deepdetect.cc mllibstrategy.h mlmodel.h
11+
mlservice.h inputconnectorstrategy.h imginputfileconn.h csvinputfileconn.h
12+
csvinputfileconn.cc csvtsinputfileconn.h csvtsinputfileconn.cc
13+
svminputfileconn.h svminputfileconn.cc txtinputfileconn.h
14+
txtinputfileconn.cc apidata.h apidata.cc chain_actions.h chain_actions.cc
15+
service_stats.h service_stats.cc chain.h chain.cc ext/rmustache/mustache.h ext/rmustache/mustache.cc)
1116
if (USE_JSON_API)
1217
list(APPEND ddetect_SOURCES jsonapi.h jsonapi.cc)
1318
endif()

src/backends/caffe/caffelib.cc

+7-8
Original file line numberDiff line numberDiff line change
@@ -2810,6 +2810,7 @@ namespace dd
28102810
cad.add("has_mean_file", has_mean_file);
28112811
if (ad_output.has("measure"))
28122812
{
2813+
// FIXME(sileht): Should we create service_stats here ?
28132814
try
28142815
{
28152816
inputc.transform(cad);
@@ -2842,15 +2843,13 @@ namespace dd
28422843
if (ad.has("chain") && ad.get("chain").get<bool>())
28432844
cad.add("chain", true);
28442845

2845-
try
2846-
{
2847-
inputc.transform(cad);
2848-
}
2849-
catch (std::exception &e)
2850-
{
2851-
throw;
2852-
}
2846+
this->_stats.transform_start();
2847+
inputc.transform(cad);
2848+
this->_stats.transform_end();
2849+
28532850
int batch_size = inputc.test_batch_size();
2851+
this->_stats.inc_inference_count(batch_size);
2852+
28542853
if (ad_mllib.has("net"))
28552854
{
28562855
APIData ad_net = ad_mllib.getobj("net");

src/backends/dlib/dliblib.cc

+5
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,8 @@ namespace dd
141141
TOutputConnectorStrategy tout(this->_outputc);
142142
APIData cad = ad;
143143
cad.add("model_repo", this->_mlmodel._repo);
144+
145+
this->_stats.transform_start();
144146
try
145147
{
146148
inputc.transform(cad);
@@ -149,9 +151,12 @@ namespace dd
149151
{
150152
throw;
151153
}
154+
this->_stats.transform_end();
152155

153156
APIData ad_mllib = ad.getobj("parameters").getobj("mllib");
154157
int batch_size = inputc.batch_size();
158+
this->_stats.inc_inference_count(batch_size);
159+
155160
if (ad_mllib.has("test_batch_size"))
156161
{
157162
batch_size = ad_mllib.get("test_batch_size").get<int>();

src/backends/ncnn/ncnnlib.cc

+5
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,8 @@ namespace dd
177177

178178
TInputConnectorStrategy inputc(this->_inputc);
179179
TOutputConnectorStrategy tout(this->_outputc);
180+
181+
this->_stats.transform_start();
180182
try
181183
{
182184
inputc.transform(ad);
@@ -185,6 +187,9 @@ namespace dd
185187
{
186188
throw;
187189
}
190+
this->_stats.transform_end();
191+
192+
this->_stats.inc_inference_count(inputc._ids.size());
188193

189194
// if height (timestep) changes we need to clear net before recreating an
190195
// extractor with new height, and to reload params and models after clear()

src/backends/tensorrt/tensorrtinputconns.cc

+1
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ namespace dd
8282
_imgs_size.insert(std::pair<std::string, std::pair<int, int>>(
8383
this->_ids.at(i), this->_images_size.at(i)));
8484
}
85+
_batch_size = this->_images.size();
8586
_batch_index = 0;
8687
}
8788

src/backends/tensorrt/tensorrtinputconns.h

+1
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ namespace dd
9090
std::string _meanfname = "mean.binaryproto";
9191
std::string _correspname = "corresp.txt";
9292
int _batch_index = 0;
93+
int _batch_size = 0;
9394
int process_batch(const unsigned int batch_size);
9495
std::unordered_map<std::string, std::pair<int, int>>
9596
_imgs_size; /**< image sizes, used in detection. */

src/backends/tensorrt/tensorrtlib.cc

+4
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,7 @@ namespace dd
464464
APIData cad = ad;
465465

466466
TOutputConnectorStrategy tout(this->_outputc);
467+
this->_stats.transform_start();
467468
try
468469
{
469470
inputc.transform(cad);
@@ -472,6 +473,9 @@ namespace dd
472473
{
473474
throw;
474475
}
476+
this->_stats.transform_end();
477+
478+
this->_stats.inc_inference_count(inputc._batch_size);
475479

476480
int idoffset = 0;
477481
std::vector<APIData> vrad;

src/backends/tf/tflib.cc

+4
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,8 @@ namespace dd
329329
TOutputConnectorStrategy tout;
330330
APIData cad = ad;
331331
cad.add("model_repo", this->_mlmodel._repo);
332+
333+
this->_stats.transform_start();
332334
try
333335
{
334336
inputc.transform(cad);
@@ -337,9 +339,11 @@ namespace dd
337339
{
338340
throw;
339341
}
342+
this->_stats.transform_end();
340343

341344
APIData ad_mllib = ad.getobj("parameters").getobj("mllib");
342345
int batch_size = inputc.batch_size();
346+
this->_stats.inc_inference_count(batch_size);
343347
if (ad_mllib.has("test_batch_size"))
344348
batch_size = ad_mllib.get("test_batch_size").get<int>();
345349

src/backends/torch/torchlib.cc

+7-2
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,8 @@
3737
#include "generators/net_caffe.h"
3838
#include "generators/net_caffe_recurrent.h"
3939

40-
#include <sys/types.h>
4140
#include <sys/stat.h>
41+
#include <sys/types.h>
4242
#include <fcntl.h>
4343

4444
#include "native/native.h"
@@ -1020,6 +1020,7 @@ namespace dd
10201020
if (_module._native != nullptr)
10211021
_module._native->update_input_connector(inputc);
10221022

1023+
this->_stats.transform_start();
10231024
TOutputConnectorStrategy outputc(this->_outputc);
10241025
try
10251026
{
@@ -1039,6 +1040,8 @@ namespace dd
10391040
{
10401041
throw;
10411042
}
1043+
this->_stats.transform_end();
1044+
10421045
torch::Device cpu("cpu");
10431046
_module.eval();
10441047

@@ -1074,6 +1077,8 @@ namespace dd
10741077
{
10751078
in_vals.push_back(tensor.to(_device));
10761079
}
1080+
this->_stats.inc_inference_count(in_vals.size());
1081+
10771082
Tensor output;
10781083
try
10791084
{
@@ -1336,4 +1341,4 @@ namespace dd
13361341
template class TorchLib<TxtTorchInputFileConn, SupervisedOutput, TorchModel>;
13371342
template class TorchLib<CSVTSTorchInputFileConn, SupervisedOutput,
13381343
TorchModel>;
1339-
}
1344+
} // namespace dd

src/backends/xgb/xgblib.cc

+4
Original file line numberDiff line numberDiff line change
@@ -452,6 +452,8 @@ namespace dd
452452
// data
453453
TInputConnectorStrategy inputc(this->_inputc);
454454
APIData cad = ad;
455+
456+
this->_stats.transform_start();
455457
try
456458
{
457459
inputc.transform(cad);
@@ -460,6 +462,7 @@ namespace dd
460462
{
461463
throw;
462464
}
465+
this->_stats.transform_end();
463466

464467
// load existing model as needed
465468
if (!_learner)
@@ -503,6 +506,7 @@ namespace dd
503506
// results
504507
// float loss = 0.0; // XXX: how to acquire loss ?
505508
int batch_size = preds.Size();
509+
this->_stats.inc_inference_count(batch_size);
506510
int nclasses = _nclasses;
507511
if (_objective == "multi:softprob")
508512
batch_size /= nclasses;

src/jsonapi.cc

-2
Original file line numberDiff line numberDiff line change
@@ -1575,8 +1575,6 @@ namespace dd
15751575
out.toJVal(jpred, jout);
15761576
JVal jhead(rapidjson::kObjectType);
15771577
jhead.AddMember("method", "/chain", jpred.GetAllocator());
1578-
// jhead.AddMember("service",d["service"],jpred.GetAllocator());
1579-
// if (!has_measure)
15801578
jhead.AddMember("time", jout["time"], jpred.GetAllocator());
15811579
jpred.AddMember("head", jhead, jpred.GetAllocator());
15821580
JVal jbody(rapidjson::kObjectType);

src/mllibstrategy.h

+9-10
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#define MLLIBSTRATEGY_H
2424

2525
#include "apidata.h"
26+
#include "service_stats.h"
2627
#include "utils/fileops.hpp"
2728
#include "dd_spdlog.h"
2829
#include <atomic>
@@ -94,7 +95,7 @@ namespace dd
9495
MLLib(MLLib &&mll) noexcept
9596
: _inputc(mll._inputc), _outputc(mll._outputc), _mltype(mll._mltype),
9697
_mlmodel(mll._mlmodel), _meas(mll._meas),
97-
_meas_per_iter(mll._meas_per_iter),
98+
_meas_per_iter(mll._meas_per_iter), _stats(mll._stats),
9899
_tjob_running(mll._tjob_running.load()), _logger(mll._logger)
99100
{
100101
}
@@ -298,8 +299,9 @@ namespace dd
298299
int c = 0;
299300
for (double l : vl)
300301
{
301-
std::string measl = meas + '_' + cnames.at(c); // std::to_string(c);
302-
auto hit = _meas.find(measl);
302+
std::string measl = meas + '_' + cnames.at(c);
303+
auto hit = _meas.find(
304+
measl); // not reusing add_meas since need a global lock
303305
if (hit != _meas.end())
304306
(*hit).second = l;
305307
else
@@ -309,7 +311,7 @@ namespace dd
309311
}
310312

311313
/**
312-
* \brief get currentvalue of argument measure
314+
* \brief get current value of argument measure
313315
* @param meas measure name
314316
* @return current value of measure
315317
*/
@@ -379,15 +381,12 @@ namespace dd
379381
std::unordered_map<std::string, std::vector<double>>
380382
_meas_per_iter; /**< model measures per iteration. */
381383

384+
ServiceStats _stats; /**< service statistics/metrics .*/
385+
382386
std::atomic<bool> _tjob_running = {
383387
false
384388
}; /**< whether a training job is running with this lib instance. */
385389

386-
bool _online
387-
= false; /**< whether the algorithm is online, i.e. it interleaves
388-
training and prediction calls. When not, prediction calls
389-
are rejected while training is running. */
390-
391390
std::shared_ptr<spdlog::logger> _logger; /**< mllib logger. */
392391

393392
long int _model_flops = 0; /**< model flops. */
@@ -398,7 +397,7 @@ namespace dd
398397
protected:
399398
mutable std::mutex
400399
_meas_per_iter_mutex; /**< mutex over measures history. */
401-
mutable std::mutex _meas_mutex; /** mutex around current measures. */
400+
mutable std::mutex _meas_mutex; /**< mutex around current measures. */
402401
const int _max_meas_points = 1e7; // 10M points max per measure
403402
};
404403

src/mlservice.h

+23-26
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ namespace dd
248248
ad.add("height", this->_inputc.height());
249249
}
250250
}
251+
this->_stats.to(ad);
251252
return ad;
252253
}
253254

@@ -283,7 +284,9 @@ namespace dd
283284
static_cast<long int>(this->_mem_used_train * sizeof(float)));
284285
stats.add("data_mem_test",
285286
static_cast<long int>(this->_mem_used_test * sizeof(float)));
286-
ad.add("stats", stats);
287+
ad.add("stats", stats); // FIXME(sileht): deprecated name, delete me when
288+
// platform use the new name
289+
ad.add("model_stats", stats);
287290
ad.add("jobs", vad);
288291
ad.add("parameters", _init_parameters);
289292
ad.add("repository", this->_inputc._model_repo);
@@ -292,6 +295,7 @@ namespace dd
292295
ad.add("type", std::string("unsupervised"));
293296
else
294297
ad.add("type", std::string("supervised"));
298+
this->_stats.to(ad);
295299
return ad;
296300
}
297301

@@ -495,36 +499,29 @@ namespace dd
495499
*/
496500
int predict_job(const APIData &ad, APIData &out, const bool &chain = false)
497501
{
498-
// TODO: collect input transformed data for chain, store it here in
499-
// memory
500-
// -> beware, the input connector is a copy...
502+
if (!_train_mutex.try_lock_shared())
503+
throw MLServiceLockException(
504+
"Predict call while training with an offline learning algorithm");
501505

502-
if (!this->_online)
506+
this->_stats.predict_start();
507+
508+
int err = 0;
509+
try
503510
{
504-
if (!_train_mutex.try_lock_shared())
505-
throw MLServiceLockException("Predict call while training with an "
506-
"offline learning algorithm");
507-
int err = 0;
508-
try
509-
{
510-
if (chain)
511-
const_cast<APIData &>(ad).add("chain", true);
512-
err = this->predict(ad, out);
513-
}
514-
catch (std::exception &e)
515-
{
516-
_train_mutex.unlock_shared();
517-
throw;
518-
}
519-
_train_mutex.unlock_shared();
520-
return err;
511+
if (chain)
512+
const_cast<APIData &>(ad).add("chain", true);
513+
err = this->predict(ad, out);
521514
}
522-
else // wait til a lock can be acquired
515+
catch (std::exception &e)
523516
{
524-
boost::shared_lock<boost::shared_mutex> lock(_train_mutex);
525-
return this->predict(ad, out);
517+
_train_mutex.unlock_shared();
518+
this->_stats.predict_end(false);
519+
throw;
526520
}
527-
return 0;
521+
this->_stats.predict_end(true);
522+
523+
_train_mutex.unlock_shared();
524+
return err;
528525
}
529526

530527
std::string _sname; /**< service name. */

0 commit comments

Comments
 (0)