libcw/cwDataSets.h

305 lines
11 KiB
C
Raw Permalink Normal View History

//| Copyright: (C) 2020-2024 Kevin Larke <contact AT larke DOT org>
//| License: GNU GPL version 3.0 or above. See the accompanying LICENSE file.
#ifndef cwDataSets_h
#define cwDataSets_h
2020-10-30 13:40:39 +00:00
/*
2020-10-30 13:40:39 +00:00
Select a dataset and provide batched data/label pairs.
1. In-memory datasets, stream from disk.
2. Train/valid/test set marking.
3. K-fold rotation.
2. Conversion from source data type to batch data type.
3. One-hot encoding.
4. Shuffling.
Options:
1. Read all data into memory (otherwise stream from disk -require async reading)
2. data type conversion on-load vs on-batch.
3. one-hot encoding on-load vs on-batch.
4. shuffle
a. from streaming input buffer.
b. in memory
c. on batch
Source Driver:
label() // string label of this source
open(cfg) // open the source
close() // close the source
get_info() // get the source dim and type info
read(N,dst_t,dataBuf,labelBuf);// read a block of N examples and cvt to type dst_t
Implementation:
The only difference between streaming from disk and initial load to memory is that
stream-from-disk fills a second copy of the in-memory data structure.
All set marking, both RVT and K-Fold, happen on the in-memory data structure after it is populated.
Shuffling happens on the in-memory data structure after it is populated.
If there is no data conversion or one-hot conversion on batch output then shuffling moves elements in-memory otherwise
the shuffle index vector is used as a lookup during the output step.
If K-Fold segmentation is used with a streaming dataset then the k-fold index must persist
between fold selection passes.
*/
namespace cw
{
namespace dataset
{
/*
wtr: Writes columnar numeric files one row at a time. The data in a column
may be multidimensional. In othe words the data in a column may be a matrix.
Furthermore the data in a column may have a variable shape.
Usage:
1. Use define_columns() to name and describe the shape of the data in each column.
If a data has a variable size then set the variable dimension to 0.
2. For each row in the source dataset
3. For each column in the source dataset
4. Call wtr::write() to cache the column contents
5. Call write_record() to write the record to disk.
Notes:
a. The data type of a column is determined by the data type of the column in the first row.
b. The data type of a column may not change after the first row.
File Format:
Offset | Field | Label
-------|-------|------------------------
4 | 0 | record_count
4 | 1 | column_count
v | 0 2 | label [ cnt, c0, c1, c2 ...]
4 | 1 3 | id
4 | 2 4 | varDimN
4 | 3 5 | rankN
4 | 4 6 | maxEleN
4 | 5 7 | max typeflags
v | 6 8 | max value
4 | 7 9 | min typeflags
4 | 8 10 | min value
4 | 9 11 | dimV[0]
4 | 10 12 | maxDimV[0]
4 | . | dimV[1]
4 | . | maxDimV[1]
column 0 column 1 column N
---------------------- --------------------- ---------------------
Row Format: { <row_byte_count> { <varDimV0> <data0> } { <varDimV1> <data1> } ... { <varDimVN> <dataN> } }
Note that if a column's data has a fixed size then the <varDimV> is empty.
*/
namespace wtr
{
typedef handle<struct wtr_str> handle_t;
rc_t create( handle_t& h, const char* fn );
rc_t destroy( handle_t& h );
// Define the shape of each column. Set variable length dimensions to 0.
rc_t define_columns( handle_t h, const char* label, unsigned columnId, unsigned rankN, const unsigned* dimV );
// Cache one column of data which will then be written on the call to write_record().
// If all the dimensions are defined in the column configuration then set dimV to nullptr;
rc_t write( handle_t h, unsigned columnId, const int* dV, unsigned dN, const unsigned* dimV=nullptr, unsigned dimN=0 );
rc_t write( handle_t h, unsigned columnId, const float* dV, unsigned dN, const unsigned* dimV=nullptr, unsigned dimN=0 );
rc_t write( handle_t h, unsigned columnId, const double* dV, unsigned dN, const unsigned* dimV=nullptr, unsigned dimN=0 );
// Write the
rc_t write_record( handle_t h );
rc_t test( const object_t* cfg );
}
namespace rdr
{
typedef handle<struct rdr_str> handle_t;
enum
{
kIntRdrFl = 0x01,
kFloatRdrFl = 0x02,
kDoubleRdrFl = 0x04
};
typedef struct col_str
{
const char* label; // Unique column label
unsigned id; // Unique column id
unsigned typeId; // See k???RdrFl type flags
unsigned varDimN; // Count of variable sized dimensions. 0 if this is a fixed size column.
unsigned rankN; // Count of elements in dimV[]
unsigned* dimV; // dimV[rankN]. Dimensions with value zero are undefined and set per field.
unsigned eleN; // Size of current column value
unsigned* maxDimV; // maxDimV[rankN]. Maximum value for each dimension. Same as dimV[]
variant::value_t max; // Max value of all data elements in this field
variant::value_t min; // Min value of all data elements in this field
unsigned maxEleN; // Max. count of elements in any one field.
unsigned maxByteN; // Max. size of this field in bytes
unsigned byteOffset; // Byte offset of the value of this field in the current record buffer.
unsigned byteN; // Size of this field in bytes.
} col_t;
enum {
kShuffleFl = 0x01
};
rc_t create( handle_t& h, const char* fn, unsigned cacheBufByteN, unsigned flags=kShuffleFl );
rc_t destroy( handle_t& h );
unsigned column_count( handle_t h );
const col_t* column_cfg( handle_t h, unsigned colIdx );
const col_t* column_cfg( handle_t h, const char* colLabel );
unsigned record_count( handle_t h);
unsigned cur_record_index( handle_t h );
unsigned next_record_index( handle_t h );
enum {
kOkState, // Normal state
kErrorState, // An error has occurred which render the rdr unusable.
kEofState // The end of the file has been encountered.
};
unsigned state( handle_t h );
rc_t seek( handle_t h, unsigned recordIdx );
// Read the next record.
rc_t read( handle_t h, unsigned recordIdx=kInvalidIdx );
// Get a column value from the last record returned by 'read()'.
//
// vRef = Pointer to the value vector.
// nRef = Count of elements in value vector.
// dimVRef = Dimension vector. nRef = cumprod(dimVRef)
rc_t get( handle_t h, unsigned columnId, const int*& vRef, unsigned& nRef, const unsigned*& dimVRef );
rc_t get( handle_t h, unsigned columnId, const float*& vRef, unsigned& nRef, const unsigned*& dimVRef );
rc_t get( handle_t h, unsigned columnId, const double*& vRef, unsigned& nRef, const unsigned*& dimVRef );
rc_t report( handle_t h );
rc_t test( const object_t* cfg );
2020-10-30 13:40:39 +00:00
}
namespace adapter
{
typedef handle<struct adapter_str> handle_t;
enum {
kPreInitState,
kInitState,
kEofState,
kErrorState
};
enum {
kTrackColDimFl = 0x01,
kIntFl = 0x10, // Field Type Flags: int
kFloatFl = 0x20, // float
kDoubleFl = 0x40, // double
kTypeMask = 0x70 // (int | float | double)
};
2020-10-30 13:40:39 +00:00
typedef struct colMap_str
{
unsigned colId; // Column identifier from the rdr
unsigned fieldEleOffset; // Offset into field record of this column
unsigned eleN; // Count of elements in this column
const unsigned* dimV; // Shape of this column
unsigned rankN; // dimV[ rankN ] Rank of this column
} colMap_t;
rc_t create( handle_t& hRef, const char* fn, unsigned maxBatchN, unsigned cacheByteN, unsigned flags=rdr::kShuffleFl );
rc_t destroy( handle_t& hRef );
2020-10-30 13:40:39 +00:00
// Create a field and assign it a column.
rc_t create_field( handle_t h, unsigned fieldId, unsigned flags, const char* colLabel=nullptr, bool oneHotFl=false );
2020-10-30 13:40:39 +00:00
// Assign an additional column to a field
rc_t assign_column( handle_t h, unsigned fieldId, const char* colLabel, bool oneHotFl=false );
// Total count of records in the dataset.
unsigned record_count( handle_t h );
// Field element count for fixed size fields.
unsigned field_fixed_ele_count( handle_t h, unsigned fieldId );
2020-10-30 13:40:39 +00:00
// Read and cache batchN records.
// recordIdxV[ batchN ] is an optional array of record indexes
rc_t read( handle_t h, unsigned batchN, const unsigned* recordIdxV=nullptr );
2020-10-30 13:40:39 +00:00
// Return field vectors formed on the previous call to read().
// fV[ eleN, batchN ]
// fNV[ batchN ] = eleN for each column of vV[]
rc_t get( handle_t h, unsigned fieldId, const int*& fV_Ref, const unsigned*& fNV_Ref );
rc_t get( handle_t h, unsigned fieldId, const float*& fV_Ref, const unsigned*& fNV_Ref );
rc_t get( handle_t h, unsigned fieldId, const double*& fV_Ref, const unsigned*& fNV_Ref );
2020-10-30 13:40:39 +00:00
// Returns col position and geometry data from each record returned by the last call to read().
// Returns colMapV_Ref[batchN][columnN].
rc_t column_map( handle_t h, unsigned fieldId, colMap_t const * const *& colMapV_Ref );
// See k???State above for return values.
unsigned state( handle_t h );
// Print a field to stdout. If fmt==nullptr then a format is automatically set based on the data type.
rc_t print_field( handle_t h, unsigned fieldId, const char* fmt=nullptr );
rc_t test( const object_t* cfg );
}
2020-10-30 13:40:39 +00:00
namespace mnist
{
typedef handle<struct mnist_str> handle_t;
enum {
kPixelRowN = 28,
kPixelColN = 28
};
rc_t create( handle_t& h, const char* inDir );
rc_t destroy( handle_t& h );
unsigned record_count( handle_t h );
rc_t seek( handle_t h, unsigned exampleIdx );
rc_t dataM( handle_t h, const float*& dataM, const unsigned*& labelV, unsigned exampleN, unsigned& actualExampleN_Ref, unsigned exampleIdx=kInvalidIdx );
2020-10-30 13:40:39 +00:00
rc_t write( handle_t h, const char* fn );
2020-10-30 13:40:39 +00:00
rc_t test( const object_t* cfg );
}
2020-10-30 13:40:39 +00:00
rc_t test( const object_t* cfg );
}
}
#endif