//| Copyright: (C) 2020-2024 Kevin Larke //| License: GNU GPL version 3.0 or above. See the accompanying LICENSE file. #ifndef cwDataSets_h #define cwDataSets_h /* Select a dataset and provide batched data/label pairs. 1. In-memory datasets, stream from disk. 2. Train/valid/test set marking. 3. K-fold rotation. 2. Conversion from source data type to batch data type. 3. One-hot encoding. 4. Shuffling. Options: 1. Read all data into memory (otherwise stream from disk -require async reading) 2. data type conversion on-load vs on-batch. 3. one-hot encoding on-load vs on-batch. 4. shuffle a. from streaming input buffer. b. in memory c. on batch Source Driver: label() // string label of this source open(cfg) // open the source close() // close the source get_info() // get the source dim and type info read(N,dst_t,dataBuf,labelBuf);// read a block of N examples and cvt to type dst_t Implementation: The only difference between streaming from disk and initial load to memory is that stream-from-disk fills a second copy of the in-memory data structure. All set marking, both RVT and K-Fold, happen on the in-memory data structure after it is populated. Shuffling happens on the in-memory data structure after it is populated. If there is no data conversion or one-hot conversion on batch output then shuffling moves elements in-memory otherwise the shuffle index vector is used as a lookup during the output step. If K-Fold segmentation is used with a streaming dataset then the k-fold index must persist between fold selection passes. */ namespace cw { namespace dataset { /* wtr: Writes columnar numeric files one row at a time. The data in a column may be multidimensional. In othe words the data in a column may be a matrix. Furthermore the data in a column may have a variable shape. Usage: 1. Use define_columns() to name and describe the shape of the data in each column. If a data has a variable size then set the variable dimension to 0. 2. For each row in the source dataset 3. For each column in the source dataset 4. Call wtr::write() to cache the column contents 5. Call write_record() to write the record to disk. Notes: a. The data type of a column is determined by the data type of the column in the first row. b. The data type of a column may not change after the first row. File Format: Offset | Field | Label -------|-------|------------------------ 4 | 0 | record_count 4 | 1 | column_count v | 0 2 | label [ cnt, c0, c1, c2 ...] 4 | 1 3 | id 4 | 2 4 | varDimN 4 | 3 5 | rankN 4 | 4 6 | maxEleN 4 | 5 7 | max typeflags v | 6 8 | max value 4 | 7 9 | min typeflags 4 | 8 10 | min value 4 | 9 11 | dimV[0] 4 | 10 12 | maxDimV[0] 4 | . | dimV[1] 4 | . | maxDimV[1] column 0 column 1 column N ---------------------- --------------------- --------------------- Row Format: { { } { } ... { } } Note that if a column's data has a fixed size then the is empty. */ namespace wtr { typedef handle handle_t; rc_t create( handle_t& h, const char* fn ); rc_t destroy( handle_t& h ); // Define the shape of each column. Set variable length dimensions to 0. rc_t define_columns( handle_t h, const char* label, unsigned columnId, unsigned rankN, const unsigned* dimV ); // Cache one column of data which will then be written on the call to write_record(). // If all the dimensions are defined in the column configuration then set dimV to nullptr; rc_t write( handle_t h, unsigned columnId, const int* dV, unsigned dN, const unsigned* dimV=nullptr, unsigned dimN=0 ); rc_t write( handle_t h, unsigned columnId, const float* dV, unsigned dN, const unsigned* dimV=nullptr, unsigned dimN=0 ); rc_t write( handle_t h, unsigned columnId, const double* dV, unsigned dN, const unsigned* dimV=nullptr, unsigned dimN=0 ); // Write the rc_t write_record( handle_t h ); rc_t test( const object_t* cfg ); } namespace rdr { typedef handle handle_t; enum { kIntRdrFl = 0x01, kFloatRdrFl = 0x02, kDoubleRdrFl = 0x04 }; typedef struct col_str { const char* label; // Unique column label unsigned id; // Unique column id unsigned typeId; // See k???RdrFl type flags unsigned varDimN; // Count of variable sized dimensions. 0 if this is a fixed size column. unsigned rankN; // Count of elements in dimV[] unsigned* dimV; // dimV[rankN]. Dimensions with value zero are undefined and set per field. unsigned eleN; // Size of current column value unsigned* maxDimV; // maxDimV[rankN]. Maximum value for each dimension. Same as dimV[] variant::value_t max; // Max value of all data elements in this field variant::value_t min; // Min value of all data elements in this field unsigned maxEleN; // Max. count of elements in any one field. unsigned maxByteN; // Max. size of this field in bytes unsigned byteOffset; // Byte offset of the value of this field in the current record buffer. unsigned byteN; // Size of this field in bytes. } col_t; enum { kShuffleFl = 0x01 }; rc_t create( handle_t& h, const char* fn, unsigned cacheBufByteN, unsigned flags=kShuffleFl ); rc_t destroy( handle_t& h ); unsigned column_count( handle_t h ); const col_t* column_cfg( handle_t h, unsigned colIdx ); const col_t* column_cfg( handle_t h, const char* colLabel ); unsigned record_count( handle_t h); unsigned cur_record_index( handle_t h ); unsigned next_record_index( handle_t h ); enum { kOkState, // Normal state kErrorState, // An error has occurred which render the rdr unusable. kEofState // The end of the file has been encountered. }; unsigned state( handle_t h ); rc_t seek( handle_t h, unsigned recordIdx ); // Read the next record. rc_t read( handle_t h, unsigned recordIdx=kInvalidIdx ); // Get a column value from the last record returned by 'read()'. // // vRef = Pointer to the value vector. // nRef = Count of elements in value vector. // dimVRef = Dimension vector. nRef = cumprod(dimVRef) rc_t get( handle_t h, unsigned columnId, const int*& vRef, unsigned& nRef, const unsigned*& dimVRef ); rc_t get( handle_t h, unsigned columnId, const float*& vRef, unsigned& nRef, const unsigned*& dimVRef ); rc_t get( handle_t h, unsigned columnId, const double*& vRef, unsigned& nRef, const unsigned*& dimVRef ); rc_t report( handle_t h ); rc_t test( const object_t* cfg ); } namespace adapter { typedef handle handle_t; enum { kPreInitState, kInitState, kEofState, kErrorState }; enum { kTrackColDimFl = 0x01, kIntFl = 0x10, // Field Type Flags: int kFloatFl = 0x20, // float kDoubleFl = 0x40, // double kTypeMask = 0x70 // (int | float | double) }; typedef struct colMap_str { unsigned colId; // Column identifier from the rdr unsigned fieldEleOffset; // Offset into field record of this column unsigned eleN; // Count of elements in this column const unsigned* dimV; // Shape of this column unsigned rankN; // dimV[ rankN ] Rank of this column } colMap_t; rc_t create( handle_t& hRef, const char* fn, unsigned maxBatchN, unsigned cacheByteN, unsigned flags=rdr::kShuffleFl ); rc_t destroy( handle_t& hRef ); // Create a field and assign it a column. rc_t create_field( handle_t h, unsigned fieldId, unsigned flags, const char* colLabel=nullptr, bool oneHotFl=false ); // Assign an additional column to a field rc_t assign_column( handle_t h, unsigned fieldId, const char* colLabel, bool oneHotFl=false ); // Total count of records in the dataset. unsigned record_count( handle_t h ); // Field element count for fixed size fields. unsigned field_fixed_ele_count( handle_t h, unsigned fieldId ); // Read and cache batchN records. // recordIdxV[ batchN ] is an optional array of record indexes rc_t read( handle_t h, unsigned batchN, const unsigned* recordIdxV=nullptr ); // Return field vectors formed on the previous call to read(). // fV[ eleN, batchN ] // fNV[ batchN ] = eleN for each column of vV[] rc_t get( handle_t h, unsigned fieldId, const int*& fV_Ref, const unsigned*& fNV_Ref ); rc_t get( handle_t h, unsigned fieldId, const float*& fV_Ref, const unsigned*& fNV_Ref ); rc_t get( handle_t h, unsigned fieldId, const double*& fV_Ref, const unsigned*& fNV_Ref ); // Returns col position and geometry data from each record returned by the last call to read(). // Returns colMapV_Ref[batchN][columnN]. rc_t column_map( handle_t h, unsigned fieldId, colMap_t const * const *& colMapV_Ref ); // See k???State above for return values. unsigned state( handle_t h ); // Print a field to stdout. If fmt==nullptr then a format is automatically set based on the data type. rc_t print_field( handle_t h, unsigned fieldId, const char* fmt=nullptr ); rc_t test( const object_t* cfg ); } namespace mnist { typedef handle handle_t; enum { kPixelRowN = 28, kPixelColN = 28 }; rc_t create( handle_t& h, const char* inDir ); rc_t destroy( handle_t& h ); unsigned record_count( handle_t h ); rc_t seek( handle_t h, unsigned exampleIdx ); rc_t dataM( handle_t h, const float*& dataM, const unsigned*& labelV, unsigned exampleN, unsigned& actualExampleN_Ref, unsigned exampleIdx=kInvalidIdx ); rc_t write( handle_t h, const char* fn ); rc_t test( const object_t* cfg ); } rc_t test( const object_t* cfg ); } } #endif