@@ -1669,6 +1669,7 @@ namespace dd
1669
1669
{
1670
1670
if (_cifc)
1671
1671
{
1672
+ _cifc->_columns .clear ();
1672
1673
std::string test_file = _cifc->_csv_test_fname ;
1673
1674
_cifc->_csv_test_fname = " " ;
1674
1675
_cifc->read_csv (fname);
@@ -1697,63 +1698,100 @@ namespace dd
1697
1698
return 0 ;
1698
1699
}
1699
1700
1700
- int DDCCsvTS::read_dir (const std::string &dir, bool is_test_data,
1701
- bool update_bounds)
1701
+ int DDCCsvTS::read_dir (const std::string &dir)
1702
1702
{
1703
- // first recursive list csv files
1704
- std::unordered_set<std::string> allfiles ;
1705
- int ret = fileops::list_directory (dir, true , false , true , allfiles );
1703
+ // - list all CSV files in directory
1704
+ std::unordered_set<std::string> trainfiles ;
1705
+ int ret = fileops::list_directory (dir, true , false , true , trainfiles );
1706
1706
if (ret != 0 )
1707
1707
return ret;
1708
1708
// then simply read them
1709
1709
if (!_cifc)
1710
1710
return -1 ;
1711
1711
1712
- if (update_bounds && _cifc->_scale
1713
- && (_cifc->_min_vals .empty () || _cifc->_max_vals .empty ()))
1712
+ // - pick one file up and read header once
1713
+ std::string fname = (*trainfiles.begin ());
1714
+ std::ifstream csv_file (fname, std::ios::binary);
1715
+ if (!csv_file.is_open ())
1716
+ throw InputConnectorBadParamException (" cannot open file " + fname);
1717
+ std::string hline;
1718
+ std::getline (csv_file, hline);
1719
+ _cifc->read_header (hline);
1720
+
1721
+ // - read all test files
1722
+ std::unordered_set<std::string> testfiles;
1723
+ if (!_cifc->_csv_test_fname .empty ())
1724
+ fileops::list_directory (_cifc->_csv_test_fname , true , false , true ,
1725
+ testfiles);
1726
+
1727
+ std::unordered_set<std::string> allfiles = trainfiles;
1728
+
1729
+ // - aggregate all files = train + test
1730
+ allfiles.insert (testfiles.begin (), testfiles.end ());
1731
+
1732
+ // - read categoricals first if any as it affects the number of columns (and
1733
+ // thus bounds)
1734
+ if (!_cifc->_categoricals .empty ())
1714
1735
{
1715
- std::unordered_set<std::string> reallyallfiles;
1716
- ret = fileops::list_directory (_cifc->_csv_test_fname , true , false ,
1717
- true , reallyallfiles);
1718
- reallyallfiles.insert (allfiles.begin (), allfiles.end ());
1736
+ std::unordered_map<std::string, CCategorical> categoricals;
1737
+ for (auto fname : allfiles)
1738
+ {
1739
+ csv_file = std::ifstream (fname, std::ios::binary);
1740
+ if (!csv_file.is_open ())
1741
+ throw InputConnectorBadParamException (" cannot open file "
1742
+ + fname);
1743
+ std::string hline;
1744
+ std::getline (csv_file, hline); // skip header
1745
+
1746
+ // read on categoricals
1747
+ _cifc->fillup_categoricals (csv_file);
1748
+ _cifc->merge_categoricals (categoricals);
1749
+ }
1750
+ }
1719
1751
1720
- std::vector<double > min_vals = _cifc->_min_vals ;
1721
- std::vector<double > max_vals = _cifc->_max_vals ;
1722
- for (auto fname : reallyallfiles)
1752
+ // - read bounds across all TS CSV files
1753
+ if (_cifc->_scale
1754
+ && (_cifc->_min_vals .empty () || _cifc->_max_vals .empty ()))
1755
+ {
1756
+ std::vector<double > min_vals (_cifc->_min_vals );
1757
+ std::vector<double > max_vals (_cifc->_max_vals );
1758
+ for (auto fname : allfiles)
1723
1759
{
1724
- std::pair<std::vector< double > , std::vector< double >> mm
1725
- = _cifc-> get_min_max_vals (fname);
1726
- if (min_vals. empty ())
1727
- min_vals = mm. first ;
1728
- else
1729
- for ( size_t j = 0 ; j < mm. first . size (); j++)
1730
- min_vals. at (j) = std::min (mm. first . at (j), min_vals. at (j));
1731
- if (max_vals. empty ())
1732
- max_vals = mm. second ;
1733
- else
1734
- for ( size_t j = 0 ; j < mm. first . size (); j++)
1735
- max_vals. at (j) = std::max (mm. second . at (j) , max_vals. at (j) );
1760
+ csv_file = std::ifstream (fname , std::ios::binary);
1761
+ if (!csv_file. is_open ())
1762
+ throw InputConnectorBadParamException ( " cannot open file "
1763
+ + fname) ;
1764
+ std::string hline;
1765
+ std::getline (csv_file, hline); // skip header
1766
+
1767
+ // - read bounds min/max
1768
+ _cifc-> _min_vals . clear () ;
1769
+ _cifc-> _max_vals . clear ();
1770
+ _cifc-> find_min_max (csv_file);
1771
+ _cifc-> merge_min_max (min_vals , max_vals);
1736
1772
}
1773
+
1774
+ // - update global bounds
1737
1775
_cifc->_min_vals = min_vals;
1738
1776
_cifc->_max_vals = max_vals;
1739
1777
_cifc->serialize_bounds ();
1740
1778
}
1741
1779
1742
- if (!is_test_data && _cifc->_shuffle )
1780
+ // shuffle training data as needed
1781
+ std::vector<std::string> trainfiles_v;
1782
+ for (auto fname : trainfiles)
1783
+ trainfiles_v.push_back (fname);
1784
+ if (_cifc->_shuffle )
1743
1785
{
1744
- std::vector<std::string> allfiles_v;
1745
- for (auto fname : allfiles)
1746
- allfiles_v.push_back (fname);
1747
1786
auto rng = std::default_random_engine ();
1748
- std::shuffle (allfiles_v.begin (), allfiles_v.end (), rng);
1749
- for (auto fname : allfiles_v)
1750
- read_file (fname, is_test_data);
1787
+ std::shuffle (trainfiles_v.begin (), trainfiles_v.end (), rng);
1751
1788
}
1752
- else
1753
-
1754
- for (auto fname : allfiles)
1755
- read_file (fname, is_test_data);
1756
1789
1790
+ for (auto fname : trainfiles_v)
1791
+ read_file (fname, false );
1792
+ for (auto fname : testfiles)
1793
+ read_file (fname, true );
1794
+ _cifc->update_columns ();
1757
1795
return 0 ;
1758
1796
}
1759
1797
@@ -2034,8 +2072,7 @@ namespace dd
2034
2072
DDCCsvTS ddccsvts;
2035
2073
ddccsvts._cifc = this ;
2036
2074
ddccsvts._adconf = ad_input;
2037
- ddccsvts.read_dir (_csv_fname, false , true );
2038
- ddccsvts.read_dir (_csv_test_fname, true , false );
2075
+ ddccsvts.read_dir (_csv_fname);
2039
2076
2040
2077
_txn->Commit ();
2041
2078
_ttxn->Commit ();
0 commit comments