diff --git a/cwDataSets.cpp b/cwDataSets.cpp index 681c4d6..30668ff 100644 --- a/cwDataSets.cpp +++ b/cwDataSets.cpp @@ -7,9 +7,1965 @@ #include "cwFileSys.h" #include "cwVectOps.h" #include "cwMtx.h" +#include "cwVariant.h" #include "cwDataSets.h" #include "cwSvg.h" #include "cwTime.h" +#include "cwText.h" + + +//---------------------------------------------------------------------------------------------------------------------------- +//---------------------------------------------------------------------------------------------------------------------------- +//---------------------------------------------------------------------------------------------------------------------------- + +namespace cw +{ + namespace dataset + { + namespace wtr + { + typedef struct col_str + { + rdr::col_t col; // Public fields. + unsigned char* cur; // Cache of the current column data contents. + unsigned curByteN; // Count of bytes in cur[]. + unsigned* curDimV; // Cache of the current column dimensions. + struct col_str* link; // Link to next col_t record. + } col_t; + + typedef struct wtr_str + { + file::handle_t fH; // Output file handle + unsigned record_count; // Total count of rows. 
+ col_t* colL; // Linked list of column descriptions + unsigned totalVarDimN; // Total count of unknown dim's among all columns + } wtr_t; + + inline wtr_t* _handleToPtr( handle_t h ) + { return handleToPtr(h); } + + rc_t _destroy( wtr_t* p ) + { + col_t* c = p->colL; + while( c != nullptr ) + { + col_t* c0 = c->link; + mem::free(const_cast(c->col.label)); + mem::release(c->col.dimV); + mem::release(c->col.maxDimV); + mem::release(c->cur); + mem::release(c->curDimV); + mem::release(c); + c = c0; + } + + file::close(p->fH); + mem::release(p); + + return kOkRC; + } + + col_t* _columnIdToPtr( wtr_t* p, unsigned columnId ) + { + col_t* c = p->colL; + for(; c!=nullptr; c=c->link ) + if( c->col.id == columnId ) + return c; + + cwLogError(kInvalidArgRC,"The dataset column id %i was not found.",columnId); + return nullptr; + + } + + // eleN = count of elements in dV[] + // dimV[ dimN ] = dimensions for variable sized data elements. cumprod(dimV) must equal eleN + rc_t _write_column_to_buf( wtr_t* p, unsigned columnId, unsigned eleN, const unsigned* dimV, unsigned dimN, const void* dV, unsigned typeFlags, col_t*& colPtrRef ) + { + col_t* c = _columnIdToPtr(p,columnId); + + if( c == nullptr ) + return cwLogError(kInvalidArgRC,"Unable to locate the column description associated with id: %i.",columnId); + + // if this is a fixed size column + if( c->col.varDimN == 0 ) + { + // verify that the element count matches the fixed element count + if( eleN != c->col.maxEleN ) + return cwLogError(kInvalidArgRC,"Data vector in column '%s' has %i elements but should have %i elements.", cwStringNullGuard(c->col.label), eleN, c->col.maxEleN ); + + if( dimV != nullptr || dimN != 0 ) + cwLogWarning("The dimension vector for the fixed sized column '%s' is ignored in the write() function.",cwStringNullGuard(c->col.label)); + + } + else // this is a variable sized column + { + unsigned tmpEleN = 1; + for(unsigned i=0; icol.rankN; ++i) + { + tmpEleN *= dimV[i]; // track the count of elements + 
c->col.maxDimV[i] = std::max(c->col.maxDimV[i], dimV[i] ); // track the max. dimension + c->curDimV[i] = dimV[i]; // store the this columns dimensions + } + + // verify that the sizeof the data matches the size given in the dimensions + if( tmpEleN != eleN ) + return cwLogError(kInvalidArgRC,"The product of the dimension vector does not equal the count of elements in column '%s'.",c->col.label); + + } + + if( p->record_count == 0) + { + // set data type + c->col.max.flags = typeFlags; + c->col.min.flags = typeFlags; + } + else + { + // verify data type is the same for all elements + if( c->col.max.flags != typeFlags ) + return cwLogError(kInvalidArgRC,"The data vector type '%s' does not match the column type '%s'.", variant::flagsToLabel(typeFlags), variant::flagsToLabel(c->col.max.flags)); + + } + + // store the bytes associated with col/row + unsigned bytesPerEle = variant::flagsToBytes(typeFlags); + + + if( bytesPerEle == 0 ) + return cwLogError(kInvalidArgRC,"Invalid type identifier in column '%s'.", cwStringNullGuard(c->col.label)); + else + { + c->curByteN = bytesPerEle * eleN; + c->cur = mem::resize(c->cur,c->curByteN); + memcpy(c->cur,dV,c->curByteN); + } + + colPtrRef = c; + + return kOkRC; + } + + + rc_t _write_hdr( wtr_t* p ) + { + col_t* c; + rc_t rc; + + p->totalVarDimN = 0; + + // get the count of columns + unsigned col_count = 0; + for(c=p->colL; c!=nullptr; c=c->link) + ++col_count; + + if((rc = file::write( p->fH, p->record_count )) != kOkRC ) goto errLabel; + if((rc = file::write( p->fH, col_count )) != kOkRC ) goto errLabel; + + for(c=p->colL; c!=nullptr; c=c->link) + { + if((rc = file::writeStr( p->fH, c->col.label )) != kOkRC ) goto errLabel; + if((rc = file::write( p->fH, c->col.id )) != kOkRC ) goto errLabel; + if((rc = file::write( p->fH, c->col.varDimN )) != kOkRC ) goto errLabel; + if((rc = file::write( p->fH, c->col.rankN )) != kOkRC ) goto errLabel; + if((rc = file::write( p->fH, c->col.maxEleN )) != kOkRC ) goto errLabel; + if((rc = 
variant::write( p->fH, c->col.max)) != kOkRC ) goto errLabel; + if((rc = variant::write( p->fH, c->col.min )) != kOkRC ) goto errLabel; + + for(unsigned i=0; icol.rankN; ++i) + { + if((rc = file::write( p->fH, c->col.dimV[i] )) != kOkRC ) goto errLabel; + if((rc = file::write( p->fH, c->col.maxDimV[i])) != kOkRC ) goto errLabel; + } + + p->totalVarDimN += c->col.varDimN; + + } + + errLabel: + return rc; + } + + rc_t _re_write_hdr( wtr_t* p ) + { + rc_t rc; + if((rc = file::seek( p->fH, file::kBeginFl, 0)) != kOkRC ) + return cwLogError( kSeekFailRC, "Data file Header seek failed."); + + if((rc = _write_hdr( p )) != kOkRC ) + return cwLogError( rc, "Header re-write failed."); + + return rc; + } + } + } +} + +cw::rc_t cw::dataset::wtr::create( handle_t& h, const char* fn ) +{ + rc_t rc; + if((rc = destroy(h)) != kOkRC ) + return rc; + + auto p = mem::allocZ(1); + + if((rc = file::open(p->fH,fn,file::kWriteFl)) != kOkRC ) + { + rc = cwLogError(rc,"Data file creation failed."); + goto errLabel; + } + + h.set(p); + + errLabel: + if(rc != kOkRC ) + _destroy(p); + + return rc; +} + +cw::rc_t cw::dataset::wtr::destroy( handle_t& h ) +{ + rc_t rc = kOkRC; + + if( !h.isValid()) + return rc; + + wtr_t* p = _handleToPtr(h); + + if(( rc = _re_write_hdr( p )) != kOkRC ) + return rc; + + if((rc = _destroy(p)) != kOkRC ) + return rc; + + h.clear(); + + return rc; +} + +cw::rc_t cw::dataset::wtr::define_columns( handle_t h, const char* label, unsigned columnId, unsigned rankN, const unsigned* dimV ) +{ + rc_t rc = kOkRC; + wtr_t* p = _handleToPtr(h); + col_t* c = mem::allocZ(1); + c->col.label = mem::duplStr(label); + c->col.id = columnId; + c->col.rankN = rankN; + c->col.varDimN = 0; + c->col.dimV = mem::allocDupl( dimV, rankN ); + c->col.maxDimV = mem::allocDupl( dimV, rankN ); + c->curDimV = mem::allocDupl( dimV, rankN ); + c->col.maxEleN = 1; + + for(unsigned i=0; icol.maxEleN *= dimV[i]; + + if( dimV[i] == 0 ) + { + c->col.varDimN +=1; + } + } + + // link the new col recd to 
the end of the column list + col_t* c0 = p->colL; + col_t* c1 = nullptr; + for(; c0!=nullptr; c0=c0->link) + c1 = c0; + + if( c1 == nullptr ) + p->colL = c; + else + c1->link = c; + + return rc; +} + +cw::rc_t cw::dataset::wtr::write( handle_t h, unsigned columnId, const int* dV, unsigned eleN, const unsigned* dimV, unsigned rankN ) +{ + rc_t rc; + wtr_t* p = _handleToPtr(h); + col_t* c = nullptr; + + if((rc = _write_column_to_buf( p, columnId, eleN, dimV, rankN, dV, variant::kInt32VFl, c)) != kOkRC ) + return rc; + + if( p->record_count == 0 ) + { + c->col.min.u.i32 = vop::min( dV, eleN ); + c->col.max.u.i32 = vop::max( dV, eleN ); + //printf("0i %i %i\n", columnId, c->col.min.u.i32 ); + } + else + { + //printf("1i %i %i\n", columnId, c->col.min.u.i32 ); + c->col.min.u.i32 = std::min(c->col.min.u.i32, vop::min( dV, eleN )); + c->col.max.u.i32 = std::max(c->col.max.u.i32, vop::max( dV, eleN )); + } + + return rc; +} + +cw::rc_t cw::dataset::wtr::write( handle_t h, unsigned columnId, const float* dV, unsigned eleN, const unsigned* dimV, unsigned rankN ) +{ + rc_t rc; + wtr_t* p = _handleToPtr(h); + col_t* c = nullptr; + + if((rc = _write_column_to_buf( p, columnId, eleN, dimV, rankN, dV, variant::kFloatVFl, c)) != kOkRC ) + return rc; + + if( p->record_count == 0 ) + { + c->col.min.u.f = vop::min( dV, eleN ); + c->col.max.u.f = vop::max( dV, eleN ); + } + else + { + c->col.min.u.f = std::min(c->col.min.u.f, vop::min( dV, eleN )); + c->col.max.u.f = std::max(c->col.max.u.f, vop::max( dV, eleN )); + } + + return rc; +} + +cw::rc_t cw::dataset::wtr::write( handle_t h, unsigned columnId, const double* dV, unsigned eleN, const unsigned* dimV, unsigned rankN ) +{ + rc_t rc; + wtr_t* p = _handleToPtr(h); + col_t* c = nullptr; + + if((rc = _write_column_to_buf( p, columnId, eleN, dimV, rankN, dV, variant::kDoubleVFl, c)) != kOkRC ) + return rc; + + if( p->record_count == 0 ) + { + c->col.min.u.d = vop::min( dV, eleN ); + c->col.max.u.d = vop::max( dV, eleN ); + } + else + { 
+ c->col.min.u.d = std::min(c->col.min.u.d, vop::min( dV, eleN )); + c->col.max.u.d = std::max(c->col.max.u.d, vop::max( dV, eleN )); + } + + return rc; +} + +cw::rc_t cw::dataset::wtr::write_record( handle_t h ) +{ + rc_t rc; + wtr_t* p = _handleToPtr(h); + col_t* c; + + // if this is the first row in the file then write the file header + if( p->record_count == 0 ) + if((rc = _write_hdr(p)) != kOkRC ) + return rc; + + unsigned rowByteN = 0; + + // calculate the size of the row data + for(c=p->colL; c!=nullptr; c=c->link) + rowByteN += c->col.varDimN * sizeof(unsigned) + c->curByteN; + + // write the size of this row + if((rc = file::write(p->fH, rowByteN)) != kOkRC ) + goto errLabel; + + // for each column + for(c=p->colL; c!=nullptr; c=c->link) + { + // if this is a variable sized column + if( c->col.varDimN > 0 ) + { + // then write the variable sized dimensions + for(unsigned i=0; icol.rankN; ++i) + if( c->col.dimV[i] == 0 ) + if((rc = file::write( p->fH, c->curDimV[i] )) != kOkRC ) + goto errLabel; + } + + + // write the column field value + if((rc = file::write( p->fH, c->cur, c->curByteN)) != kOkRC ) + goto errLabel; + + } + + errLabel: + if( rc != kOkRC ) + rc = cwLogError(rc,"Example index %i write failed", p->record_count); + else + p->record_count += 1; + + return rc; +} + +/* + +File Format for the following data. +where the data record itself is repeated 3 time. 
+ +unsigned dim0V[] = {1}; +unsigned dim1V[] = {3}; +unsigned dim2V[] = {2,0}; +unsigned dim3V[] = {2,2}; + +int val0[] = {0}; +int val1[] = {1,2,3}; +int val2[] = {4,5,6,7,8,9}; +int val3[] = {10,11,13,14}; + +0300 0000 3 recd_count +0400 0000 4 col_count + +0400 0000 label size - col0 +636f 6c30 label +0000 0000 id +0000 0000 varDimN +0100 0000 rankN +0100 0000 maxEleN +4000 0000 max type +0000 0000 max value +0000 0000 +4000 0000 min type +0000 0000 min value +0000 0000 +0100 0000 dimV[0] +0100 0000 maxDimV[0] + +0400 0000 label size - col 1 +636f 6c31 label +0100 0000 id +0000 0000 varDimN +0100 0000 rankN +0300 0000 maxEleN +4000 0000 max type +0300 0000 max value +0000 0000 +4000 0000 min type +0100 0000 min value +0000 0000 +0300 0000 dimV[0] +0300 0000 maxDimV[0] + +0400 0000 label size - col 2 +636f 6c32 label +0200 0000 id +0100 0000 varDimN +0200 0000 rankN +0000 0000 maxEleN +4000 0000 max type +0900 0000 max value +0000 0000 +4000 0000 min type +0400 0000 min value +0000 0000 +0200 0000 dimV[0] +0200 0000 maxDimV[0] + +0000 0000 dimV[1] +0300 0000 maxDimV[1] + +0400 0000 label size - col 3 +636f 6c33 label +0300 0000 id +0000 0000 varDimN +0200 0000 rankN +0400 0000 maxEleN +4000 0000 max type +0e00 0000 max value +0000 0000 +4000 0000 min type +0a00 0000 min value +0000 0000 +0200 0000 dimV[0] +0200 0000 maxDimV[0] +0200 0000 dimV[1] +0200 0000 maxDimV[1] + +3c00 0000 recd0 size (60 bytes) +0000 0000 0 col0 + +0100 0000 1 col1[0] +0200 0000 2 col1[1] +0300 0000 3 col1[2] + +0300 0000 dimV[1] col2 <- variable dimension +0400 0000 4 col2[0] +0500 0000 5 +0600 0000 6 +0700 0000 7 +0800 0000 8 +0900 0000 9 + +0a00 0000 10 col3 +0b00 0000 11 +0d00 0000 13 +0e00 0000 14 + +3c00 0000 recd1 size (60 bytes) +0100 0000 1 col0 + +0100 0000 +0200 0000 +0300 0000 + +0300 0000 +0400 0000 +0500 0000 +0600 0000 +0700 0000 +0800 0000 +0900 0000 + +0a00 0000 +0b00 0000 +0d00 0000 +0e00 0000 + +3c00 0000 recd2 size (60 bytes) +0200 0000 2 col0 + +0100 0000 +0200 0000 
+0300 0000 + +0300 0000 +0400 0000 +0500 0000 +0600 0000 +0700 0000 +0800 0000 +0900 0000 + +0a00 0000 +0b00 0000 +0d00 0000 +0e00 0000 + */ +cw::rc_t cw::dataset::wtr::test( const object_t* cfg ) +{ + rc_t rc = kOkRC; + char* outFn = nullptr; + handle_t h; + + if((rc = cfg->getv("outFn",outFn)) != kOkRC ) + return cwLogError(rc,"wtr test failed. Argument parse failed."); + + outFn = filesys::expandPath(outFn); + + if((rc = create(h,outFn)) != kOkRC ) + { + rc = cwLogError(rc,"rdr create failed."); + goto errLabel; + } + else + { + enum { kId0, kId1, kId2, kId3 }; + unsigned dim0V[] = {1}; + unsigned dim1V[] = {3}; + unsigned dim2V[] = {2,0}; + unsigned dim3V[] = {2,2}; + + unsigned dim0N = cwCountOf(dim0V); + unsigned dim1N = cwCountOf(dim1V); + unsigned dim2N = cwCountOf(dim2V); + unsigned dim3N = cwCountOf(dim3V); + + int val0[] = {0}; + int val1[] = {1,2,3}; + int val2[] = {4,5,6,7,8,9}; + int val3[] = {10,11,13,14}; + + if((rc = define_columns(h, "col0", kId0, dim0N, dim0V )) != kOkRC ) + { + rc = cwLogError(rc,"Define column 0 failed."); + goto errLabel; + } + + if((rc = define_columns(h, "col1", kId1, dim1N, dim1V )) != kOkRC ) + { + rc = cwLogError(rc,"Define column 1 failed."); + goto errLabel; + } + + if((rc = define_columns(h, "col2", kId2, dim2N, dim2V )) != kOkRC ) + { + rc = cwLogError(rc,"Define column 2 failed."); + goto errLabel; + } + + if((rc = define_columns(h, "col3", kId3, dim3N, dim3V )) != kOkRC ) + { + rc = cwLogError(rc,"Define column 3 failed."); + goto errLabel; + } + + for(unsigned i=0; i<3; ++i) + { + + val0[0] = i; + + write( h, kId0, val0, dim0V[0] ); + write( h, kId1, val1, dim1V[0] ); + + dim2V[1] = 3; + write( h, kId2, val2, dim2V[0]*dim2V[1], dim2V, dim2N ); + write( h, kId3, val3, dim3V[0]*dim3V[1] ); + write_record(h); + } + + } + + errLabel: + destroy(h); + mem::release(outFn); + + return rc; +} + +//---------------------------------------------------------------------------------------------------------------------------- 
+//---------------------------------------------------------------------------------------------------------------------------- +//---------------------------------------------------------------------------------------------------------------------------- +namespace cw +{ + namespace dataset + { + namespace rdr + { + enum + { + kSizeofRecordHeader = sizeof(unsigned) + }; + + typedef struct + { + col_t col; // Public record + unsigned* varDimIdxV; // varDimIdxV[] Dimension indexes that are variable in this column. + unsigned varDimIdxN; // Count of values in varDimIdxV[]. + } c_t; + + typedef struct rdr_str + { + c_t* colA; // colA[ column_count ] Per column data. + unsigned column_count; // Count of elements in colA[]. + unsigned record_count; // Count of total examples. + file::handle_t fH; // Backing data file handle. + std::uint8_t* buf; // buf[ bufMaxByteN ] File read buffer + unsigned bufMaxByteN; // Allocated size of buf[] in bytes. (also sizeof fixed size records) + unsigned bufCurByteN; // Current count of bytes used in buf[]. + bool isFixedSizeFl; // True if all fields are fixed size + + unsigned curRecordIdx; // Index of record in buf[]. + unsigned nextRecordIdx; // Index of the next record to read. 
+ long baseFileByteOffs; // File byte offset of the first data record + + unsigned state; // See k???State enum + + + } rdr_t; + + typedef struct type_str + { + const char* label; + unsigned typeId; + unsigned variantFl; + } type_t; + + type_t _typeRefA[] = { + { "int", kIntRdrFl, variant::kInt32VFl }, + { "float", kFloatRdrFl, variant::kFloatVFl }, + { "double", kDoubleRdrFl, variant::kDoubleVFl }, + { nullptr, 0, 0 } + }; + + rdr_t* _handleToPtr(handle_t h ) + { return handleToPtr(h); } + + + const type_t* _typeIdToDesc( unsigned typeId ) + { + for(const type_t* t=_typeRefA; t->label!=nullptr; ++t) + if( t->typeId == typeId ) + return t; + + cwLogError(kInvalidArgRC,"The dataset rdr typeId %i is not valid.", typeId); + return nullptr; + } + + const type_t* _varTypeFlagsToDesc( unsigned variantFl ) + { + for(const type_t* t=_typeRefA; t->label!=nullptr; ++t) + if( t->variantFl == variantFl ) + return t; + + return nullptr; + } + + const char* _typeIdToLabel( unsigned typeId ) + { + const type_t* t; + if((t = _typeIdToDesc(typeId)) == nullptr ) + return nullptr; + return t->label; + } + + bool _typeIdMatch( unsigned typeId, unsigned variantTypeFl ) + { + const type_t* t; + + if((t = _typeIdToDesc(typeId)) == nullptr ) + return false; + + return t->typeId==typeId && t->variantFl==variantTypeFl; + } + + const c_t* _colFromId( rdr_t* p, unsigned columnId ) + { + for(unsigned i=0; icolumn_count; ++i) + if( p->colA[i].col.id == columnId ) + return p->colA + i; + + cwLogError(kInvalidArgRC,"Invalid columnId (%i).", columnId ); + return nullptr; + } + + const c_t* _colFromLabel( rdr_t* p, const char* colLabel ) + { + for(unsigned i=0; icolumn_count; ++i) + if( textCompare(p->colA[i].col.label, colLabel) == 0 ) + return p->colA + i; + + cwLogError(kInvalidArgRC,"Invalid column label:%s.", colLabel ); + return nullptr; + } + + rc_t _destroy( rdr_t* p ) + { + for(unsigned i=0; icolumn_count; ++i) + { + mem::release( p->colA[i].col.dimV ); + mem::release( 
p->colA[i].col.maxDimV ); + mem::release( p->colA[i].varDimIdxV); + mem::free( const_cast(p->colA[i].col.label) ); + } + + file::close(p->fH); + mem::release(p->colA); + mem::release(p->buf); + mem::release(p); + + return kOkRC; + } + + rc_t _readHdr( rdr_t* p ) + { + rc_t rc = kOkRC; + unsigned bufOffsByteN = 0; + + p->bufMaxByteN = 0; + p->isFixedSizeFl = true; + + if((rc = read(p->fH,p->record_count)) != kOkRC ) goto errLabel; + if((rc = read(p->fH,p->column_count)) != kOkRC ) goto errLabel; + + p->colA = mem::allocZ( p->column_count); + + // for each column + for(unsigned i=0; icolumn_count; ++i) + { + c_t* c = p->colA + i; + + if((rc = readStr( p->fH,(char**)&c->col.label,255)) != kOkRC ) goto errLabel; + if((rc = read(p->fH,c->col.id)) != kOkRC ) goto errLabel; + if((rc = read(p->fH,c->col.varDimN)) != kOkRC ) goto errLabel; + if((rc = read(p->fH,c->col.rankN )) != kOkRC ) goto errLabel; + if((rc = read(p->fH,c->col.maxEleN )) != kOkRC ) goto errLabel; + if((rc = variant::read( p->fH, c->col.max)) != kOkRC ) goto errLabel; + if((rc = variant::read( p->fH, c->col.min )) != kOkRC ) goto errLabel; + + + c->col.dimV = mem::allocZ( c->col.rankN ); + c->col.maxDimV = mem::allocZ( c->col.rankN ); + c->varDimIdxV = mem::allocZ( c->col.rankN ); + c->varDimIdxN = 0; + + c->col.maxEleN = c->col.rankN==0 ? 0 : 1; + + for(unsigned j=0; jcol.rankN; ++j) + { + if((rc = file::read( p->fH, c->col.dimV[j] )) != kOkRC ) goto errLabel; + if((rc = file::read( p->fH, c->col.maxDimV[j])) != kOkRC ) goto errLabel; + + if( c->col.dimV[j] == 0 ) + c->varDimIdxV[c->varDimIdxN++] = j; + + c->col.maxEleN *= c->col.maxDimV[j]; + } + + unsigned bytesPerEle = variant::flagsToBytes(c->col.max.flags); + + const type_t* t; + if((t = _varTypeFlagsToDesc(c->col.max.flags)) == nullptr ) + rc = cwLogError(kInvalidDataTypeRC,"The column %s is not a valid data type (e.g. 
int, float double).",cwStringNullGuard(c->col.label)); + else + c->col.typeId = t->typeId; + + // TODO: why maintain both eleN and maxEleN and byteN and maxByteN? + c->col.eleN = c->col.maxEleN; + c->col.maxByteN = bytesPerEle * c->col.maxEleN; + c->col.byteOffset = bufOffsByteN; + c->col.byteN = c->col.maxByteN; + + p->bufMaxByteN += c->col.maxByteN + c->varDimIdxN * sizeof(unsigned); // Track the max file buffer size + + if( c->col.varDimN != 0 && p->isFixedSizeFl ) + p->isFixedSizeFl = false; + + bufOffsByteN = p->bufMaxByteN; + } + + p->buf = mem::alloc(p->bufMaxByteN); + + // store the file offset to the first data record + rc = tell(p->fH,&p->baseFileByteOffs); + + errLabel: + if( rc != kOkRC ) + { + rc = cwLogError(rc,"Data set file header read failed."); + p->state = kErrorState; + } + + return rc; + } + + rc_t _rewind( rdr_t* p ) + { + rc_t rc; + if((rc = file::seek( p->fH, file::kBeginFl, p->baseFileByteOffs)) != kOkRC ) + p->state = kErrorState; + else + { + p->curRecordIdx = kInvalidIdx; + p->nextRecordIdx = 0; + } + return rc; + } + + rc_t _var_seek( rdr_t* p, unsigned recdIdx ) + { + rc_t rc = kOkRC; + + if( recdIdx < p->nextRecordIdx ) + if((rc = _rewind(p)) != kOkRC ) + goto errLabel; + + for(; recdIdx < p->nextRecordIdx; ++recdIdx ) + { + unsigned recdByteN; + if((rc = file::read(p->fH,recdByteN)) != kOkRC ) + { + p->state = kErrorState; + goto errLabel; + } + + if((rc = file::seek(p->fH, file::kCurFl, recdByteN )) != kOkRC ) + { + p->state = kErrorState; + goto errLabel; + } + } + + errLabel: + return rc; + } + + // Seek to the a record, but don't actually read it. + rc_t _seek( rdr_t* p, unsigned recdIdx ) + { + rc_t rc = kOkRC; + + if( p->nextRecordIdx == recdIdx ) + return rc; + + if( recdIdx >= p->record_count ) + { + rc = cwLogError(kInvalidArgRC,"The seek index %i is invalid. 
Record Count=%i", recdIdx, p->record_count); + goto errLabel; + } + + if( p->isFixedSizeFl ) + rc = _var_seek(p,recdIdx); + else + { + // fixed size recds offset = baseOffset + (recd_index * (sizeof(recd_byte_cnt) + sizeof(data_record))) + rc = file::seek( p->fH, file::kBeginFl, p->baseFileByteOffs + recdIdx * (kSizeofRecordHeader + p->bufMaxByteN)); + } + + if( rc == kOkRC ) + p->nextRecordIdx = recdIdx; + + errLabel: + return rc; + + } + + rc_t _parse_var_record( rdr_t* p ) + { + rc_t rc = kOkRC; + + p->bufCurByteN = 0; + + for(unsigned i=0; icolumn_count; ++i) + { + c_t* c = p->colA + i; + + // if this is a variabled sized column + if( c->col.varDimN != 0 ) + { + unsigned* varDimV = reinterpret_cast(p->buf + p->bufCurByteN ); + unsigned eleN = c->col.rankN==0 ? 0 : 1; + + // for each dim. of this column + for(unsigned j=0,k=0; jcol.rankN; ++j) + { + // if this is a variable sized dimension then set the actual dim. size + if( kvarDimIdxN && c->varDimIdxV[k] == j ) + { + c->col.dimV[j] = varDimV[k]; + k += 1; + p->bufCurByteN += sizeof(varDimV[k]); + } + + // calc the count of elements in this field + eleN *= c->col.dimV[j]; + } + + // set the size and count of elements in this field + c->col.eleN = eleN; + c->col.byteN = variant::flagsToBytes( c->col.max.flags ) * eleN; + } + + c->col.byteOffset = p->bufCurByteN; + p->bufCurByteN += c->col.byteN; + + } + return rc; + } + + + rc_t _read_record( rdr_t* p ) + { + rc_t rc = kOkRC; + + unsigned recordByteN; + + // Read the byte length of this record + if((rc = file::read(p->fH, recordByteN )) != kOkRC ) + { + if( file::eof(p->fH) ) + { + p->state = kEofState; + return kEofRC; + } + goto errLabel; + } + + assert( recordByteN <= p->bufMaxByteN ); + + // read the record data into p->buf[] + if((rc = file::read( p->fH, p->buf, recordByteN )) != kOkRC ) + goto errLabel; + + + // if all columns in the record do not have a fixed size then update + // the column pointers into the data record + if( !p->isFixedSizeFl ) + if((rc 
= _parse_var_record( p )) != kOkRC ) + goto errLabel; + + p->curRecordIdx = p->nextRecordIdx; + p->nextRecordIdx += 1; + errLabel: + return rc; + } + + rc_t _get( rdr_t* p, unsigned columnId, void*& vpRef, unsigned& nRef, const unsigned*& dimVRef, unsigned reqTypeId ) + { + const c_t* c;; + + if((c = _colFromId(p,columnId)) == nullptr ) + return kInvalidArgRC; + + if( c->col.typeId != reqTypeId ) + return cwLogError(kInvalidArgRC,"Cannot convert the column '%s' from type:%s to type:%s.", _typeIdToLabel(c->col.typeId), _typeIdToLabel(reqTypeId)); + + nRef = c->col.eleN; + dimVRef = c->col.dimV; + vpRef = p->buf + c->col.byteOffset; + return kOkRC; + } + + } + } +} + +cw::rc_t cw::dataset::rdr::create( handle_t& h, const char* fn ) +{ + rc_t rc; + if((rc = destroy(h)) != kOkRC ) + return rc; + + auto p = mem::allocZ(1); + + if((rc = file::open(p->fH, fn,file::kReadFl)) == kOkRC ) + if((rc = _readHdr(p)) != kOkRC ) + goto errLabel; + + p->state = kOkState; + p->curRecordIdx = kInvalidIdx; + h.set(p); + + errLabel: + if(rc != kOkRC ) + _destroy(p); + + return rc; +} + +cw::rc_t cw::dataset::rdr::destroy( handle_t& h ) +{ + rc_t rc = kOkRC; + + if( !h.isValid()) + return rc; + + rdr_t* p = _handleToPtr(h); + + if((rc = _destroy(p)) != kOkRC ) + return rc; + + h.clear(); + + return rc; +} + +unsigned cw::dataset::rdr::column_count( handle_t h ) +{ + rdr_t* p = _handleToPtr(h); + return p->column_count; +} + +const cw::dataset::rdr::col_t* cw::dataset::rdr::column_cfg( handle_t h, unsigned colIdx ) +{ + rdr_t* p = _handleToPtr(h); + + if( colIdx >= p->column_count ) + return nullptr; + + return &p->colA[colIdx].col; +} + +const cw::dataset::rdr::col_t* cw::dataset::rdr::column_cfg( handle_t h, const char* colLabel ) +{ + rdr_t* p = _handleToPtr(h); + const c_t* c; + + if((c = _colFromLabel(p, colLabel )) == nullptr ) + return nullptr; + + return &c->col; +} + +unsigned cw::dataset::rdr::record_count( handle_t h) +{ + rdr_t* p = _handleToPtr(h); + return p->record_count; 
+} + +unsigned cw::dataset::rdr::cur_record_index( handle_t h ) +{ + rdr_t* p = _handleToPtr(h); + return p->curRecordIdx; +} + +unsigned cw::dataset::rdr::next_record_index( handle_t h ) +{ + rdr_t* p = _handleToPtr(h); + return p->nextRecordIdx; +} + +unsigned cw::dataset::rdr::state( handle_t h ) +{ + rdr_t* p = _handleToPtr(h); + return p->state; +} + +cw::rc_t cw::dataset::rdr::seek( handle_t h, unsigned recordIdx ) +{ + rdr_t* p = _handleToPtr(h); + return _seek(p,recordIdx); +} + + +cw::rc_t cw::dataset::rdr::read( handle_t h, unsigned record_index ) +{ + rc_t rc = kOkRC; + rdr_t* p = _handleToPtr(h); + + if( record_index != kInvalidIdx ) + if((rc = _seek(p,record_index)) != kOkRC ) + return rc; + + return _read_record(p); +} + +cw::rc_t cw::dataset::rdr::get( handle_t h, unsigned columnId, const int*& vRef, unsigned& nRef, const unsigned*& dimVRef ) +{ + rdr_t* p = _handleToPtr(h); + void* vp = nullptr; + rc_t rc = _get(p, columnId, vp, nRef, dimVRef, kIntRdrFl ); + + vRef = rc!=kOkRC ? nullptr : static_cast(vp); + + return rc; +} + +cw::rc_t cw::dataset::rdr::get( handle_t h, unsigned columnId, const float*& vRef, unsigned& nRef, const unsigned*& dimVRef ) +{ + rdr_t* p = _handleToPtr(h); + void* vp = nullptr; + rc_t rc = _get(p, columnId, vp, nRef, dimVRef, kFloatRdrFl ); + + vRef = rc!=kOkRC ? nullptr : static_cast(vp); + + return rc; +} + +cw::rc_t cw::dataset::rdr::get( handle_t h, unsigned columnId, const double*& vRef, unsigned& nRef, const unsigned*& dimVRef ) +{ + rdr_t* p = _handleToPtr(h); + void* vp = nullptr; + rc_t rc = _get(p, columnId, vp, nRef, dimVRef, kDoubleRdrFl ); + + vRef = rc!=kOkRC ? 
nullptr : static_cast(vp); + + return rc; +} + +cw::rc_t cw::dataset::rdr::report( handle_t h ) +{ + rc_t rc = kOkRC; + rdr_t* p = _handleToPtr(h); + + for(unsigned i=0; icolumn_count; ++i) + { + const c_t* c = p->colA + i; + printf("id:%5i vdN:%5i mxEleN:%5i rank:%3i %8s", c->col.id, c->col.varDimN, c->col.maxEleN, c->col.rankN, _typeIdToLabel(c->col.typeId) ); + + printf(" min:"); variant::print(c->col.min); + printf(" max:"); variant::print(c->col.max); + + printf(" | "); + + for(unsigned j=0; jcol.rankN; ++j) + printf("%i ",c->col.dimV[j]); + + printf(" | "); + + for(unsigned j=0; jcol.rankN; ++j) + printf("%i ",c->col.maxDimV[j]); + + printf("\n"); + } + + return rc; +} + +cw::rc_t cw::dataset::rdr::test( const object_t* cfg ) +{ + rc_t rc = kOkRC; + char* inFn = nullptr; + handle_t h; + + if((rc = cfg->getv("inFn",inFn)) != kOkRC ) + return cwLogError(rc,"rdr test failed. Argument parse failed."); + + inFn = filesys::expandPath(inFn); + + if((rc = create(h,inFn)) != kOkRC ) + { + rc = cwLogError(rc,"rdr create failed."); + } + else + { + const int* v = nullptr; + unsigned vN = 0; + const unsigned* dimV = nullptr; + + report(h); + + while( (rc=read(h)) == kOkRC ) + { + get(h,0,v,vN,dimV); vop::print(v,vN,"%i ","c0:"); + get(h,1,v,vN,dimV); vop::print(v,vN,"%i ","c1:"); + get(h,2,v,vN,dimV); vop::print(v,vN,"%i ","c2:"); + get(h,3,v,vN,dimV); vop::print(v,vN,"%i ","c3:"); + } + + if( rc != kEofRC ) + rc = cwLogError(kOpFailRC,"The read operation failed."); + + destroy(h); + } + + return rc; +} + +//---------------------------------------------------------------------------------------------------------------------------- +//---------------------------------------------------------------------------------------------------------------------------- +//---------------------------------------------------------------------------------------------------------------------------- +namespace cw { + namespace dataset { + namespace adapter { + + typedef struct col_str + { 
+ const rdr::col_t* col; // Column description + bool oneHotFl; // Convert this column to a one-hot vector + unsigned maxEleN; // Max count of elements in the buffer from this column + int oneHotMax; // Max value in this column + int oneHotMin; // Min value in this column + unsigned* batchDimV; // batchDivV[ col.rankN, batchN ] or nullptr for fixed size columns + struct col_str* link; // + } col_t; + + typedef struct field_str + { + unsigned id; // Field Id + unsigned flags; // Field flags + bool isFixedSizeFl; // Do all columns in this field have a fixed size. + unsigned bytesPerEle; // Size of each element in buf[] (determined by flags | kfl) + unsigned bufMaxEleN; // Allocated size of buf[] for a batch size of maxBatchN + unsigned bufEleN; // Current count of elements in buf[] for the entire batch. + unsigned bufMaxFieldByteN; // Max. size in bytes of one field record. + unsigned bufByteN; // Current count of bytes in buf. + std::uint8_t* buf; // buf[ bufMaxFieldByteN*batchN ] + unsigned* batchEleNV; // batchEleN[ maxBatchN ] Count of ele's in each record of a batch. + col_t* colL; // List of columns assigned to this field + colMap_t** colMapM; // colMapM[ batchN ] + colMap_t* colMapA; // colMapA[ batchN*columnN ] Storage for colMapM[] + struct field_str* link; // + } field_t; + + typedef struct adapter_str + { + unsigned maxBatchN; // Max. possible value of batchN in a call to read(). + unsigned batchN; // Count of records returned in the last call to read(). 
+ rdr::handle_t rdrH; // Source data file + field_t* fieldL; // List of field descriptions + unsigned state; // Exception state + } adapter_t; + + inline adapter_t* _handleToPtr(handle_t h ) + { return handleToPtr(h); } + + rc_t _destroy( adapter_t* p ) + { + rc_t rc = kOkRC; + + field_t* f = p->fieldL; + while( f != nullptr ) + { + field_t* f0 = f->link; + + col_t* c = f->colL; + while( c != nullptr ) + { + col_t* c0 = c->link; + + // if this is a var width column + if( c->col->varDimN > 0 ) + mem::release(c->batchDimV); + mem::release(c); + c = c0; + } + + mem::release(f->batchEleNV); + mem::release(f->buf); + mem::release(f->colMapM); + mem::release(f->colMapA); + mem::release(f); + f=f0; + } + + rdr::destroy(p->rdrH); + + mem::release(p); + return rc; + } + + field_t* _fieldIdToRecd( adapter_t* p, unsigned fieldId ) + { + field_t* f = p->fieldL; + for(; f!=nullptr; f=f->link) + if( f->id == fieldId ) + return f; + + cwLogError(kInvalidArgRC,"Invalid field id '%i'.",fieldId); + + return nullptr; + } + + rc_t _calc_one_hot_ele_count( col_t* c, unsigned& eleN_Ref ) + { + rc_t rc = kOkRC; + + if( !variant::isInt(c->col->min) || !variant::isInt(c->col->max) ) + return cwLogError(kInvalidArgRC,"One-hot columns must be integer valued."); + + if( c->col->rankN != 1 || c->col->maxDimV[0] != 1 ) + return cwLogError(kInvalidArgRC,"One-hot columns must be scalar integers."); + + + if((rc = variant::get(c->col->min,c->oneHotMin)) != kOkRC ) + return cwLogError(rc,"Unable to obtain the one-hot minimum value."); + + if((rc = variant::get(c->col->max,c->oneHotMax)) != kOkRC ) + return cwLogError(rc,"Unable to obtain the maximum value."); + + eleN_Ref = (c->oneHotMax - c->oneHotMin) + 1; + + return rc; + } + + rc_t _assign_column( adapter_t* p, field_t* f, const char* colLabel, bool oneHotFl ) + { + rc_t rc = kOkRC; + col_t* c = mem::allocZ(1); + + if((c->col = rdr::column_cfg(p->rdrH, colLabel)) == nullptr ) + rc = kInvalidArgRC; + else + { + c->oneHotFl = oneHotFl; + + // 
locate the last link in the column list + col_t* c0 = f->colL; + while( c0!=nullptr && c0->link != nullptr ) + c0=c0->link; + + // add the new record to the end of the list + if( c0 == nullptr ) + f->colL = c; + else + c0->link = c; + + // if one-hot encoding was requested + if( oneHotFl ) + rc = _calc_one_hot_ele_count(c,c->maxEleN); + else + c->maxEleN = c->col->maxEleN; + + // update the size of the field buffer to account for the column size + f->bufMaxEleN += c->col->maxEleN; + + // if this is a variable length column + if( c->col->varDimN > 0 ) + f->isFixedSizeFl = false; + + if( cwIsFlag(f->flags,kTrackColDimFl) ) + { + // if this is a fixed size column then batchDimV is null + // otherwise it is a [batchN,rankN] matrix used to hold the dim's of each returned data ele from this column + c->batchDimV = c->col->varDimN == 0 ? nullptr : mem::allocZ(p->maxBatchN*c->col->rankN); + } + } + + if( rc != kOkRC ) + rc = cwLogError(rc,"'%s' Column assignment failed.", cwStringNullGuard(colLabel)); + + return rc; + } + + rc_t _allocate_field_buffer( adapter_t* p, field_t* f ) + { + rc_t rc = kOkRC; + f->bufMaxEleN = 0; + + // calc the field width as the sum of the max column widths + unsigned colN = 0; + for(col_t* c=f->colL; c!=nullptr; c=c->link) + { + f->bufMaxEleN += c->maxEleN; + colN += 1; + } + + f->bufMaxFieldByteN = f->bufMaxEleN * f->bytesPerEle; + f->buf = mem::alloc(p->maxBatchN * f->bufMaxFieldByteN); + + // if col. dim tracking is enabled for this field + if( cwIsFlag(f->flags,kTrackColDimFl) ) + { + // allocate the column dim tracking data structures + f->colMapM = mem::allocZ( p->maxBatchN ); + f->colMapA = mem::allocZ( p->maxBatchN * colN ); + + // initialize the fixed portion of the col. 
tracking records + for(unsigned i=0; imaxBatchN; ++i) + { + f->colMapM[i] = f->colMapA + i*colN; + + // for batch index i for each column + unsigned j=0, eleOffs=0; + for(col_t* c=f->colL; c!=nullptr; c=c->link,++j) + { + f->colMapM[i][j].colId = c->col->id; + f->colMapM[i][j].rankN = c->col->rankN; + + // if this is a fixed size field then the col. map can be completely populated in advance of reading the data + // TODO: don't allocate the complete colMapA[] array because every colN records are duplicates anyway. + // just point colMapM[] to a single row of colMapA[]. + if( !f->isFixedSizeFl ) + { + f->colMapM[i][j].eleN = c->oneHotFl ? c->maxEleN : c->col->eleN; + f->colMapM[i][j].fieldEleOffset = eleOffs; + f->colMapM[i][j].dimV = c->col->dimV; + + eleOffs += c->col->eleN; + } + else + { + f->colMapM[i][j].dimV = c->batchDimV + (i*c->col->rankN); + } + } + } + } + return rc; + } + + template< typename S, typename D > + rc_t _translate_one_hot( std::uint8_t* buf, unsigned bufByteN, const S* src, unsigned srcEleN, const col_t* c, unsigned& dstByteNRef ) + { + rc_t rc = kOkRC; + + dstByteNRef = 0; + + unsigned dstEleN = (c->oneHotMax - c->oneHotMin) + 1; + unsigned dstByteN = dstEleN * sizeof(D); + + if( dstByteN > bufByteN ) + return cwLogError(kBufTooSmallRC,"The field buffer is too small (src:%i > buf:%i) during one-hot conversion.",dstByteN,bufByteN); + + if( srcEleN != 1 ) + return cwLogError(kInvalidArgRC,"One-hot encoded fields must be scalars. 
(srcEleN:%i)",srcEleN); + + unsigned oneHotIdx = src[0] - c->oneHotMin; + + if( oneHotIdx >= dstEleN ) + return cwLogError(kInvalidArgRC,"The one-hot index (%i) is out of the one-hot vector size:%i.",oneHotIdx,dstEleN); + + memset(buf,0,dstByteN); + + D* dst = reinterpret_cast(buf); + dst[ oneHotIdx ] = 1; + + dstByteNRef = dstByteN; + + return rc; + } + + template< typename S, typename D > + rc_t _translate_datatype( const col_t* c, std::uint8_t* buf, unsigned bufByteN, const S* src, unsigned srcEleN, unsigned& dstByteNRef ) + { + if( c->oneHotFl ) + return _translate_one_hot( buf, bufByteN, src, srcEleN, c, dstByteNRef ); + + + unsigned dstByteN = srcEleN * sizeof(D); + D* dst = reinterpret_cast(buf); + + dstByteNRef = 0; + + if( dstByteN > bufByteN ) + return cwLogError(kBufTooSmallRC,"The field buffer is too small (src:%i > buf:%i).",dstByteN,bufByteN); + + // copy, and translate, the rdr::col into the field->buf[] + for(unsigned i=0; i + rc_t _translate_column_tpl(adapter_t* p, field_t* f, col_t* c, std::uint8_t* buf, unsigned bufN, unsigned& dstByteNRef) + { + rc_t rc = kOkRC; + const T* v = nullptr; + unsigned vN = 0; + const unsigned* dimV = nullptr; + + // read the column + if((rc = rdr::get(p->rdrH, c->col->id, v, vN, dimV )) != kOkRC ) + return rc; + + switch( f->flags & kTypeMask ) + { + case kIntFl: rc = _translate_datatype( c, buf, bufN, v, vN, dstByteNRef ); break; + case kFloatFl: rc = _translate_datatype( c, buf, bufN, v, vN, dstByteNRef ); break; + case kDoubleFl: rc = _translate_datatype( c, buf, bufN, v, vN, dstByteNRef ); break; + default: + assert(0); + } + + + return rc; + } + + rc_t _translate_column( adapter_t* p, field_t* f, col_t* c, std::uint8_t* buf, unsigned bufN, unsigned& dstByteNRef ) + { + rc_t rc = kOkRC; + + switch( c->col->typeId ) + { + case rdr::kIntRdrFl: rc = _translate_column_tpl( p,f,c,buf,bufN,dstByteNRef); break; + case rdr::kFloatRdrFl: rc = _translate_column_tpl( p,f,c,buf,bufN,dstByteNRef); break; + case 
rdr::kDoubleRdrFl: rc = _translate_column_tpl(p,f,c,buf,bufN,dstByteNRef); break; + default: + assert(0); + } + + return rc; + } + + rc_t _read_field( adapter_t* p, unsigned batchIdx, field_t* f, unsigned& byteNRef ) + { + rc_t rc = kOkRC; + + byteNRef = 0; + + // on the first use the buffer will not yet be allocated + if( f->buf == nullptr ) + if((rc = _allocate_field_buffer(p,f)) != kOkRC ) + return rc; + + unsigned availBufByteN = f->bufMaxFieldByteN; + unsigned fieldBufByteOffs = 0; + + // for each column of this field + for(col_t* c=f->colL; c!=nullptr; c=c->link) + { + unsigned colByteN = 0; + + // translate each source column into the field buffer + if((rc = _translate_column( p, f, c, f->buf + f->bufByteN + fieldBufByteOffs, availBufByteN, colByteN )) != kOkRC ) + return rc; + + assert( availBufByteN >= colByteN ); + + availBufByteN -= colByteN; + fieldBufByteOffs += colByteN; + + // if column dim. tracking is enabled and this is a variable with column ... + if( cwIsFlag(f->flags,kTrackColDimFl) && c->col->varDimN>0 ) + for( unsigned i=0; icol->rankN; ++i) + c->batchDimV[ batchIdx * c->col->rankN + i] = c->col->dimV[i]; // ... 
get the dim's of this column + } + + byteNRef = fieldBufByteOffs; + return rc; + } + + template< typename T > + cw::rc_t _get( handle_t h, unsigned fieldId, const T*& vV, const unsigned*& nV ) + { + rc_t rc = kOkRC; + adapter_t* p = _handleToPtr(h); + field_t* f; + + if( p->state != kInitState ) + return cwLogError(kInvalidStateRC,"get() failed The adapter is in an invalid state (%i != %i).",p->state,kInitState); + + if((f = _fieldIdToRecd(p,fieldId)) == nullptr ) + return kInvalidArgRC; + + if(f->buf == nullptr ) + return cwLogError( kInvalidStateRC, "read() must be called begore get()."); + + vV = reinterpret_cast(f->buf); + nV = f->batchEleNV; + + return rc; + } + + template< typename T > + cw::rc_t _print_field( adapter_t* p, field_t* f, const char* fmt, unsigned batchIdx, const T* v, unsigned vN ) + { + rc_t rc = kOkRC; + unsigned i = 0,k = 0; + for(col_t* c=f->colL; c!=nullptr; c=c->link,++i) + { + colMap_t* cm = f->colMapM[batchIdx] + i; + + printf("| %s %i : ", c->col->label, cm->eleN ); + for(unsigned j=0; jeleN; ++j) + printf(fmt,v[k++]); + } + return rc; + } + + template< typename T > + cw::rc_t _print_field( adapter_t* p, field_t* f, const char* fmt ) + { + rc_t rc = kOkRC; + + printf("Field:%3i \n",f->id); + for(unsigned i=0,k=0; ibatchN; ++i) + { + printf("%i : ",i); + + T* v = reinterpret_cast(f->buf) + k; + unsigned vN = f->batchEleNV[i]; + + if( cwIsFlag(f->flags,kTrackColDimFl) ) + rc = _print_field(p,f,fmt,i,v,vN); + else + for(unsigned j=0; j(1); + + if((rc = rdr::create(p->rdrH,fn)) != kOkRC ) + goto errLabel; + + p->maxBatchN = maxBatchN; + p->state = kPreInitState; + + hRef.set(p); + + errLabel: + if( rc != kOkRC ) + _destroy(p); + + return rc; +} + +cw::rc_t cw::dataset::adapter::destroy( handle_t& hRef ) +{ + rc_t rc = kOkRC; + + if( !hRef.isValid() ) + return rc; + + adapter_t* p = _handleToPtr(hRef); + + if((rc = _destroy(p)) != kOkRC ) + return rc; + + hRef.clear(); + + return rc; +} + +cw::rc_t cw::dataset::adapter::create_field( 
handle_t h, unsigned fieldId, unsigned flags, const char* colLabel, bool oneHotFl ) +{ + rc_t rc = kOkRC; + adapter_t* p = _handleToPtr(h); + field_t* f = mem::allocZ(1); + unsigned typeFlags = flags & kTypeMask; + + f->id = fieldId; + f->flags = flags; + f->link = p->fieldL; + f->batchEleNV = mem::alloc(p->maxBatchN); + p->fieldL = f; + + switch( typeFlags ) + { + case kIntFl: f->bytesPerEle = sizeof(int); break; + case kFloatFl: f->bytesPerEle = sizeof(float); break; + case kDoubleFl: f->bytesPerEle = sizeof(double); break; + default: + rc = cwLogError(kInvalidArgRC,"The field data type value 0x%x is not valid.", typeFlags ); + } + + if( colLabel != nullptr ) + rc = _assign_column( p, f, colLabel, oneHotFl ); + + return rc; +} + +cw::rc_t cw::dataset::adapter::assign_column( handle_t h, unsigned fieldId, const char* colLabel, bool oneHotFl ) +{ + adapter_t* p = _handleToPtr(h); + const rdr::col_t* c = nullptr; + field_t* f; + + if(( c = rdr::column_cfg(p->rdrH,colLabel)) == nullptr ) + return kInvalidArgRC; + + if((f = _fieldIdToRecd(p,fieldId)) == nullptr ) + return kInvalidArgRC; + + + return _assign_column( p, f, colLabel, oneHotFl ); +} + +unsigned cw::dataset::adapter::record_count( handle_t h ) +{ + adapter_t* p = _handleToPtr(h); + return rdr::record_count(p->rdrH); +} + + +unsigned cw::dataset::adapter::field_fixed_ele_count( handle_t h, unsigned fieldId ) +{ + adapter_t* p = _handleToPtr(h); + field_t* f; + + if((f = _fieldIdToRecd(p,fieldId)) == nullptr ) + return 0; + + return f->bufEleN;; +} + + +cw::rc_t cw::dataset::adapter::read( handle_t h, unsigned batchN, const unsigned* recordIdxV ) +{ + rc_t rc = kOkRC; + adapter_t* p = _handleToPtr(h); + + switch( p->state ) + { + case kInitState: + break; + + case kPreInitState: + p->state = kInitState; + break; + + default: + return cwLogError(kInvalidStateRC,"Invalid adapter state (%i != %i).",p->state,kInitState); + } + + + if( batchN > p->maxBatchN ) + return cwLogError(kInvalidArgRC,"The batch count:%i 
is greater than the max batch count:%i.",batchN,p->maxBatchN); + + p->batchN = 0; + + // for each record in this batch + for(unsigned i=0; irdrH, recordIdxV==nullptr ? kInvalidIdx : recordIdxV[i] )) != kOkRC ) + { + if( rc == kEofRC ) + p->state = kEofState; + + goto errLabel; + } + + // translate each field + for(field_t* f=p->fieldL; f!=nullptr; f=f->link) + { + unsigned fieldByteN = 0; + if( i == 0 ) + { + f->bufEleN = 0; + f->bufByteN = 0; + } + + // read the field into f->buf[] + if((rc = _read_field(p,i,f,fieldByteN)) != kOkRC ) + { + rc = cwLogError(rc,"Field (id:%i) read failed.",f->id); + goto errLabel; + } + + assert( fieldByteN % f->bytesPerEle == 0 ); + + // update the buffer state + unsigned fieldEleN = fieldByteN / f->bytesPerEle; + f->bufEleN += fieldEleN; + f->bufByteN += fieldByteN; + f->batchEleNV[i] = fieldEleN; + } + + p->batchN += 1; + } + errLabel: + if( rc != kOkRC ) + p->state = kErrorState; + + return rc; +} + +cw::rc_t cw::dataset::adapter::get( handle_t h, unsigned fieldId, const int*& vV, const unsigned*& nV ) +{ return _get(h,fieldId,vV,nV); } + +cw::rc_t cw::dataset::adapter::get( handle_t h, unsigned fieldId, const float*& vV, const unsigned*& nV ) +{ return _get(h,fieldId,vV,nV); } + +cw::rc_t cw::dataset::adapter::get( handle_t h, unsigned fieldId, const double*& vV, const unsigned*& nV ) +{ return _get(h,fieldId,vV,nV); } + + +cw::rc_t cw::dataset::adapter::column_map( handle_t h, unsigned fieldId, colMap_t const * const *& colMapV_Ref ) +{ + rc_t rc = kOkRC; + adapter_t* p = _handleToPtr(h); + field_t* f; + + if( p->state != kInitState ) + return cwLogError(kInvalidStateRC,"Invalid adapter state (%i != %i).",p->state,kInitState); + + if((f = _fieldIdToRecd(p,fieldId)) == nullptr ) + return kInvalidArgRC; + + colMapV_Ref = f->colMapM; + + return rc; +} + + +unsigned cw::dataset::adapter::state( handle_t h ) +{ + adapter_t* p = _handleToPtr(h); + return p->state; +} + +cw::rc_t cw::dataset::adapter::print_field( handle_t h, unsigned 
fieldId, const char* fmt ) +{ + rc_t rc = kOkRC; + adapter_t* p = _handleToPtr(h); + field_t* f; + + if((f = _fieldIdToRecd(p,fieldId)) == nullptr ) + return cwLogError(kInvalidArgRC,"Invalid field id (%i).",fieldId); + + switch( f->flags & kTypeMask ) + { + case kIntFl: rc = _print_field( p, f, fmt==nullptr ? "%i " : fmt ); break; + case kFloatFl: rc = _print_field( p, f, fmt==nullptr ? "%f " : fmt ); break; + case kDoubleFl: rc = _print_field(p, f, fmt==nullptr ? "%f " : fmt ); break; + default: + rc = cwLogError(kInvalidArgRC,"Unknown type flag: 0x%x.",f->flags & kTypeMask); + } + return rc; +} + + +cw::rc_t cw::dataset::adapter::test( const object_t* cfg ) +{ + rc_t rc = kOkRC; + char* inFn = nullptr; + unsigned batchN = 0; + handle_t h; + + enum { + kField0Id = 0, + kField1Id = 1 + }; + + // read the cfg args + if((rc = cfg->getv("inFn",inFn,"batchN",batchN)) != kOkRC ) + return cwLogError(rc,"adapter test failed. Argument parse failed."); + + inFn = filesys::expandPath(inFn); + + // create the adapter + if((rc = create(h, inFn, batchN)) != kOkRC ) + { + rc = cwLogError(rc,"Unable to create dataset adapter for '%s'.",inFn); + goto errLabel; + } + else + { + const int* xV = nullptr; + const float* yV = nullptr; + const unsigned* xNV = nullptr; + const unsigned* yNV = nullptr; + unsigned recdIdxV[] = { 2,1,0 }; + + if((rc = create_field( h, kField0Id, kIntFl | kTrackColDimFl, "col0", true )) != kOkRC ) + goto errLabel; + + if((rc = create_field( h, kField1Id, kFloatFl | kTrackColDimFl, "col1" )) != kOkRC ) + goto errLabel; + + if((rc = assign_column( h, kField1Id, "col2" )) != kOkRC ) + goto errLabel; + + if((rc = assign_column( h, kField1Id, "col3" )) != kOkRC ) + goto errLabel; + + assert( cwCountOf(recdIdxV) == batchN ); + + if((rc = read(h, batchN, recdIdxV )) != kOkRC ) + goto errLabel; + + if((rc = get(h, kField0Id, xV, xNV )) != kOkRC ) + goto errLabel; + + if((rc = get(h, kField1Id, yV, yNV )) != kOkRC ) + goto errLabel; + + for(unsigned i=0,n0=0,n1=0; 
itrain); - mtx::release(p->valid); - mtx::release(p->test); + mem::release(p->dataM); + mem::release(p->labelV); + mem::release(p->trainFn); + mem::release(p->validFn); + mem::release(p->testFn); mem::release(p); return rc; } - rc_t _read_file( const char* dir, const char* fn, mtx::f_t*& m ) + rc_t _read_file_record_count( const char* fn, unsigned& nRef ) { - rc_t rc = kOkRC; + rc_t rc; file::handle_t fH; - unsigned exampleN = 0; - const unsigned kPixN = 784; - const unsigned kRowN = kPixN+1; - unsigned dimV[] = {kRowN,0}; - const unsigned dimN = sizeof(dimV)/sizeof(dimV[0]); - float* v = nullptr; - char* path = filesys::makeFn(dir, fn, ".bin", NULL ); - + // open the file - if((rc = file::open(fH,path, file::kReadFl | file::kBinaryFl )) != kOkRC ) + if((rc = file::open(fH, fn, file::kReadFl | file::kBinaryFl )) != kOkRC ) { - rc = cwLogError(rc,"MNIST file open failed on '%s'.",cwStringNullGuard(path)); + rc = cwLogError(rc,"MNIST file open failed on '%s'.",cwStringNullGuard(fn)); goto errLabel; } // read the count of examples - if((rc = readUInt(fH,&exampleN)) != kOkRC ) + if((rc = read(fH,nRef)) != kOkRC ) { rc = cwLogError(rc,"Unable to read MNIST example count."); goto errLabel; } - // allocate the data memory - v = mem::alloc( kRowN * exampleN ); - - // read each example - for(unsigned i=0,j=0; ikPixN, p->kPixN)) != kOkRC ) { rc = cwLogError(rc,"Unable to read MNIST data vector on example %i.",i); goto errLabel; @@ -91,12 +2082,12 @@ namespace cw } - dimV[1] = exampleN; - m = mtx::alloc( dimV, dimN, v, mtx::kAliasReleaseFl ); errLabel: + if( rc != kOkRC) + rc = cwLogError(rc,"Load failed on MNIST file %s.",cwStringNullGuard(fn)); + file::close(fH); - mem::release(path); return rc; } } @@ -106,32 +2097,57 @@ namespace cw cw::rc_t cw::dataset::mnist::create( handle_t& h, const char* dir ) { - rc_t rc; - mnist_t* p = nullptr; + rc_t rc; + mnist_t* p = nullptr; + unsigned trainN = 0; + unsigned validN = 0; + unsigned testN = 0; if((rc = destroy(h)) != kOkRC ) 
return rc; char* inDir = filesys::expandPath(dir); - p = mem::allocZ(1); + // allocate the object + p = mem::allocZ(1); + p->kPixN = 784; + + p->trainFn = filesys::makeFn(inDir, "mnist_train", ".bin", NULL ); + p->validFn = filesys::makeFn(inDir, "mnist_valid", ".bin", NULL ); + p->testFn = filesys::makeFn(inDir, "mnist_test", ".bin", NULL ); + + mem::release(inDir); + + _read_file_record_count( p->trainFn, trainN ); + p->exampleN += trainN; + + _read_file_record_count( p->validFn, validN ); + p->exampleN += validN; + + _read_file_record_count( p->testFn, testN ); + p->exampleN += testN; + + + // allocate the data memory + p->dataM = mem::alloc( p->kPixN * p->exampleN ); + p->labelV = mem::alloc( p->exampleN ); // read the training data - if((rc = _read_file( inDir, "mnist_train", p->train )) != kOkRC ) + if((rc = _read_file( p, p->trainFn, trainN, p->dataM, p->labelV )) != kOkRC ) { rc = cwLogError(rc,"MNIST training set load failed."); goto errLabel; } // read the validation data - if((rc = _read_file( inDir, "mnist_valid", p->valid )) != kOkRC ) + if((rc = _read_file( p, p->validFn, validN, p->dataM + p->kPixN*trainN, p->labelV + trainN )) != kOkRC ) { rc = cwLogError(rc,"MNIST validation set load failed."); goto errLabel; } // read the testing data - if((rc = _read_file( inDir, "mnist_test", p->test )) != kOkRC ) + if((rc = _read_file( p, p->testFn, testN, p->dataM + p->kPixN*(trainN +validN), p->labelV + (trainN + validN) )) != kOkRC ) { rc = cwLogError(rc,"MNIST test set load failed."); goto errLabel; @@ -164,31 +2180,131 @@ cw::rc_t cw::dataset::mnist::destroy( handle_t& h ) return rc; } -const cw::mtx::f_t* cw::dataset::mnist::train( handle_t h ) + +unsigned cw::dataset::mnist::record_count( handle_t h ) { mnist_t* p = _handleToPtr(h); - return p->train; + return p->exampleN; } -const cw::mtx::f_t* cw::dataset::mnist::validate( handle_t h ) +cw::rc_t cw::dataset::mnist::seek( handle_t h, unsigned exampleIdx ) { - mnist_t* p = _handleToPtr(h); - return 
p->valid; + rc_t rc = kOkRC; + mnist_t* p = _handleToPtr(h); + + if( exampleIdx <= p->exampleN ) + p->curIdx = exampleIdx; + else + rc = cwLogError(kSeekFailRC,"Illegal seek index. Seek failed."); + + return rc; } -const cw::mtx::f_t* cw::dataset::mnist::test( handle_t h ) +cw::rc_t cw::dataset::mnist::dataM( handle_t h, const float*& dataM_Ref, const unsigned*& labelV_Ref, unsigned exampleN, unsigned& actualExampleN_Ref, unsigned exampleIdx ) { - mnist_t* p = _handleToPtr(h); - return p->test; + rc_t rc = kOkRC; + mnist_t* p = _handleToPtr(h); + + if( exampleIdx == kInvalidIdx ) + exampleIdx = p->curIdx; + + if( exampleIdx >= p->exampleN ) + return kEofRC; + + if( exampleIdx + exampleN > p->exampleN ) + exampleN = p->exampleN - exampleIdx; + + //memcpy(dataM, p->dataM + exampleIdx * p->kPixN, exampleN * p->kPixN * sizeof(p->dataM[0]) ); + //memcpy(labelV, p->labelV + exampleIdx, exampleN * sizeof(p->labelV[0]) ); + + dataM_Ref = p->dataM + exampleIdx * p->kPixN; + labelV_Ref = p->labelV + exampleIdx; + + actualExampleN_Ref = exampleN; + + p->curIdx += exampleN; + + return rc; } - -cw::rc_t cw::dataset::mnist::test( const char* dir, const char* imageFn ) +cw::rc_t cw::dataset::mnist::write( handle_t h, const char* fn ) +{ + rc_t rc = kOkRC; + unsigned recdN = record_count(h); + wtr::handle_t wtrH; + + if((rc = wtr::create(wtrH,fn)) != kOkRC ) + return cwLogError(rc,"Dataset wtr create failed."); + + enum { kImagId, kNumbId }; + unsigned numbDimV[] = {1}; + unsigned imagDimV[] = {28,28}; + unsigned imagEleN = imagDimV[0]*imagDimV[1]; + + if((rc = define_columns( wtrH, "numb", kNumbId, cwCountOf(numbDimV), numbDimV )) != kOkRC ) + goto errLabel; + + if((rc = define_columns( wtrH, "imag", kImagId, cwCountOf(imagDimV), imagDimV )) != kOkRC ) + goto errLabel; + + printf("recdN: %i\n",recdN); + + for(unsigned i=0; i < recdN; ) + { + const float* imagM = nullptr; + const unsigned* numbV = nullptr; + unsigned cacheRecdN = std::min(100u,recdN-i); + unsigned actRecdN = 0; + 
+ if((rc = dataM(h, imagM, numbV, cacheRecdN, actRecdN, i )) != kOkRC ) + { + cwLogError(rc,"Extract image data failed."); + goto errLabel; + } + + for(unsigned j=0; jgetv("inDir",inDir,"outHtmlFn",outHtmlFn)) != kOkRC ) + return cwLogError(rc,"MNIST test failed. Argument parse failed."); + + inDir = filesys::expandPath(inDir); + outHtmlFn = filesys::expandPath(outHtmlFn); + + if((rc = create(h, inDir )) == kOkRC ) { svg::handle_t svgH; @@ -196,38 +2312,50 @@ cw::rc_t cw::dataset::mnist::test( const char* dir, const char* imageFn ) rc = cwLogError(rc,"SVG Test failed on create."); else { - const mtx::f_t* m = train(h); + //const mtx::f_t* m = train(h); + /* - unsigned zn = 0; - unsigned i = 1; - for(; idimV[1]; ++i) - { + unsigned zn = 0; + unsigned i = 1; + for(; idimV[1]; ++i) + { const float* v0 = m->base + (28*28+1) * (i-1) + 1; const float* v1 = m->base + (28*28+1) * (i-0) + 1; float d = 0; for(unsigned j=0; j<28*28; ++j) - d += fabs(v0[j]-v1[j]); + d += fabs(v0[j]-v1[j]); if( d==0 ) - ++zn; + ++zn; else { - printf("%i %i %f\n",i,zn,d); - zn = 0; + printf("%i %i %f\n",i,zn,d); + zn = 0; + } } - } - printf("i:%i n:%i zn:%i\n",i,m->dimV[1],zn); + printf("i:%i n:%i zn:%i\n",i,m->dimV[1],zn); */ + + const float* dataM = nullptr; + const unsigned* labelV = nullptr; + unsigned exampleN = 10; + unsigned actualExampleN = 0; + + //mnist::seek( h, 10 ); + mnist::dataM( h, dataM, labelV, exampleN, actualExampleN ); + - for(unsigned i=0; i<10; ++i) + for(unsigned i=0; ibase + (28*28+1)*i, 28, 28, 5, svg::kInvGrayScaleColorMapId); + svg::image(svgH, dataM + (28*28)*i, 28, 28, 5, svg::kInvGrayScaleColorMapId); } - svg::write(svgH, imageFn, nullptr, svg::kStandAloneFl | svg::kGenInlineStyleFl, 10,10,10,10); + svg::write(svgH, outHtmlFn, nullptr, svg::kStandAloneFl | svg::kGenInlineStyleFl, 10,10,10,10); svg::destroy(svgH); @@ -236,519 +2364,123 @@ cw::rc_t cw::dataset::mnist::test( const char* dir, const char* imageFn ) rc = destroy(h); } + mem::release(outHtmlFn); + 
mem::release(inDir); return rc; } +//---------------------------------------------------------------------------------------------------------------------------- +//---------------------------------------------------------------------------------------------------------------------------- +//---------------------------------------------------------------------------------------------------------------------------- -namespace cw +cw::rc_t cw::dataset::test( const object_t* cfg ) { - namespace dataset + + rc_t rc = kOkRC; + char* inDir = nullptr; + char* dsFn = nullptr; + char* outHtmlFn = nullptr; + mnist::handle_t mniH; + adapter::handle_t rdrH; + svg::handle_t svgH; + unsigned batchN = 10; + + + if((rc = cfg->getv("inDir",inDir,"dsFn",dsFn,"outHtmlFn",outHtmlFn,"batchN",batchN)) != kOkRC ) + return cwLogError(rc,"MNIST test failed. Argument parse failed."); + + inDir = filesys::expandPath(inDir); + dsFn = filesys::expandPath(dsFn); + outHtmlFn = filesys::expandPath(outHtmlFn); + + // open the native MNIST object + if((rc = mnist::create(mniH, inDir )) != kOkRC ) { - - //--------------------------------------------------------------------------------------------------------------- - // struct matrix_str - // - - template< typename T > - struct matrix_str + cwLogError(rc,"Unable to open the native MNIST object."); + goto errLabel; + } + else + { + // write the MNIST data to a dataset file + if((rc = mnist::write(mniH, dsFn)) != kOkRC ) { - struct mtx::mtx_str* dataM; - struct mtx::mtx_str* labelM; - }; - - template< typename T0, typename T1 > - void _matrix_load( struct matrix_str& m, const struct mtx::mtx_str& dataM, const struct mtx::mtx_str& labelM ) - { - m.dataM = mtx::alloc(dataM,nullptr,nullptr); - m.labelM = mtx::alloc(labelM,nullptr,nullptr); - } - - template< typename T > - void _matrix_release( struct matrix_str& m ) - { - mtx::release(m.dataM); - mtx::release(m.labelM); + cwLogError(rc,"MNIST dataset write failed"); + goto errLabel; } - - 
//--------------------------------------------------------------------------------------------------------------- - // example_t - // + mnist::destroy(mniH); - typedef struct examples_str - { - unsigned type; - union - { - struct matrix_str f; - struct matrix_str d; - } u; - } examples_t; + } - template< typename T > - rc_t _examples_load( examples_t& ex, unsigned dstTypeFlag, const struct mtx::mtx_str& dataM, const struct mtx::mtx_str& labelM ) + // open a dataset adapter + if((rc = adapter::create(rdrH,dsFn,batchN)) != kOkRC ) + { + cwLogError(rc,"Dataset reader create failed."); + goto errLabel; + } + else + { + // create an SVG file + if((rc = svg::create(svgH)) != kOkRC ) + rc = cwLogError(rc,"SVG writer create failed."); + else { - rc_t rc = kOkRC; - - switch( dstTypeFlag ) - { - case kFloatFl: - _matrix_load(ex.u.f,dataM,labelM); - ex.type = dstTypeFlag; - break; - - case kDoubleFl: - _matrix_load(ex.u.d,dataM,labelM); - ex.type = dstTypeFlag; - break; - default: - rc = cwLogError(kInvalidArgRC,"An invalid example type (%i) was encountered.", dstTypeFlag); + enum { kImagId, kNumbId }; + + // create dataset fields + if((rc = create_field( rdrH, kImagId, adapter::kFloatFl, "imag" )) != kOkRC ) + { + cwLogError(rc,"Dataset rdr column define failed."); + goto errLabel; } - return rc; - } - - void _examples_destroy( examples_t& ex ) - { - switch( ex.type ) + if((rc = create_field( rdrH, kNumbId, adapter::kIntFl, "numb" )) != kOkRC ) { - case kFloatFl: _matrix_release(ex.u.f); break; - case kDoubleFl: _matrix_release(ex.u.d); break; - } - } - - rc_t _examples_data_dimV( const examples_t& ex, const unsigned*& dimV, unsigned& dimN ) - { - switch( ex.type ) - { - case kFloatFl: dimV=ex.u.f.dataM->dimV; dimN=ex.u.f.dataM->dimN; break; - case kDoubleFl: dimV=ex.u.d.dataM->dimV; dimN=ex.u.d.dataM->dimN; break; - default: - assert(0); - } - return kOkRC; - } - - rc_t _examples_label_dimV( const examples_t& ex, const unsigned*& dimV, unsigned& dimN ) - { - switch( ex.type 
) - { - case kFloatFl: dimV=ex.u.f.labelM->dimV; dimN=ex.u.f.labelM->dimN; break; - case kDoubleFl: dimV=ex.u.d.labelM->dimV; dimN=ex.u.d.labelM->dimN; break; - default: - assert(0); - } - return kOkRC; - } - - rc_t _examples_batch_f( const examples_t& ex, unsigned dataOffsetN, unsigned labelOffsetN, const float*& dataM, const float*& labelM ) - { - dataM = ex.u.f.dataM->base + dataOffsetN; - labelM = ex.u.f.labelM->base + labelOffsetN; - - return kOkRC; - } - - rc_t _examples_batch_d( const examples_t& ex, unsigned dataOffsetN, unsigned labelOffsetN, const double*& dataM, const double*& labelM ) - { - dataM = ex.u.d.dataM->base + dataOffsetN; - labelM = ex.u.d.labelM->base + labelOffsetN; - - return kOkRC; - } - - //--------------------------------------------------------------------------------------------------------------- - // datasubset_t - // - - typedef struct datasubset_str - { - examples_t examples; - unsigned batchN; - unsigned iterIdx; - unsigned iterN; - } datasubset_t; - - void _datasubset_destroy( datasubset_str& ss ) - { - ss.iterIdx = 0; - ss.iterN = 0; - _examples_destroy(ss.examples); - } - - template< typename T > - rc_t _datasetsubset_load( datasubset_t& ss, unsigned dstTypeFlag, unsigned batchN, const struct mtx::mtx_str& dataM, const struct mtx::mtx_str& labelM ) - { - unsigned exampleN = 0; - switch( dataM.dimN ) - { - case 2: exampleN = dataM.dimV[1]; break; - case 3: exampleN = dataM.dimV[2]; break; - default: - cwLogError(kInvalidArgRC,"The dataset must be contained in a matrix of 2 or 3 dimensions."); + cwLogError(rc,"Dataset rdr column define failed."); + goto errLabel; } - ss.batchN = batchN; - ss.iterN = exampleN/batchN; - return _examples_load( ss.examples, dstTypeFlag, dataM, labelM ); - } - - rc_t _datasubset_data_dimV( const datasubset_t& ss, const unsigned*& dimV, unsigned& dimN ) - { return _examples_data_dimV( ss.examples, dimV, dimN ); } - - rc_t _datasubset_label_dimV( const datasubset_t& ss, const unsigned*& dimV, unsigned& 
dimN ) - { return _examples_label_dimV( ss.examples, dimV, dimN ); } - - rc_t _datasubset_batch_f( datasubset_t& ss, unsigned dataOffsetN, unsigned labelOffsetN, const float*& dataM, const float*& labelM ) - { - rc_t rc; - - if( ss.iterIdx >= ss.iterN ) - return kEofRC; - - rc = _examples_batch_f( ss.examples, dataOffsetN * ss.iterIdx, labelOffsetN * ss.iterIdx, dataM, labelM ); - - ++ss.iterIdx; - return rc; - } - - rc_t _datasubset_batch_d( datasubset_t& ss, unsigned dataOffsetN, unsigned labelOffsetN, const double*& dataM, const double*& labelM ) - { - rc_t rc; - - if( ss.iterIdx >= ss.iterN ) - return kEofRC; - - rc = _examples_batch_d( ss.examples, dataOffsetN * ss.iterIdx, labelOffsetN * ss.iterIdx, dataM, labelM ); - - ++ss.iterIdx; - return rc; - } - - //--------------------------------------------------------------------------------------------------------------- - // datasetMgr_t - // - - enum - { - kTrainSsIdx, - kValidSsIdx, - kTestSsIdx, - kDataSubSetN - }; - - typedef struct datasetMgr_str - { - const object_t* cfg; - unsigned typeFlag; - datasubset_t ssA[ kDataSubSetN ]; - unsigned dataRealN; - unsigned labelRealN; - } datasetMgr_t; - - datasetMgr_t* _handleToPtr( handle_t h ) - { return handleToPtr< handle_t, datasetMgr_t >(h); } - - unsigned _ssFlagToIndex( unsigned flags ) - { - flags &= (kTrainSsFl | kValidSsFl | kTestSsFl ); - - switch( flags ) + // read a batch of data + if((rc = adapter::read( rdrH, batchN)) != kOkRC ) { - case kTrainSsFl: return kTrainSsIdx; - case kValidSsFl: return kValidSsIdx; - case kTestSsFl: return kTestSsIdx; - } - - cwLogError(kInvalidArgRC,"Invalid subset flags (0x%x).", flags ); - return kInvalidIdx; - } - - void _unload( datasetMgr_t* p ) - { - for(unsigned i=0; issA[i] ); - } - - rc_t _destroy( datasetMgr_t* p ) - { - _unload(p); - mem::release(p); - - return kOkRC; - } - - - unsigned _mtx_to_realN( const mtx::f_t& m ) - { - switch( m.dimN ) - { - case 1: return 1; - case 2: return m.dimV[0]; - case 3: return 
m.dimV[0] * m.dimV[1]; - } - - cwLogError(kInvalidArgRC,"%i invalid matrix rank.",m.dimN); - return 0; - } - - //rc_t _load( datasetMgr_t* p, unsigned ssFlags, unsigned batchN, const mtx::f_t& dataM, const mtx::f_t& labelM ) - - template< typename T > - rc_t _load( datasetMgr_t* p, unsigned ssFlags, unsigned batchN, const struct mtx::mtx_str& dataM, const struct mtx::mtx_str& labelM ) - { - rc_t rc = kOkRC; - unsigned ssIdx; - if(( ssIdx = _ssFlagToIndex(ssFlags)) != kInvalidIdx ) - if((rc = _datasetsubset_load( p->ssA[ssIdx], p->typeFlag, batchN, dataM, labelM )) != kOkRC ) - { - p->dataRealN = _mtx_to_realN(dataM); - p->labelRealN = _mtx_to_realN(labelM); - return kOkRC; - } - - return kInvalidArgRC; - } - - rc_t _mnist_load_subset( datasetMgr_t* p, unsigned ssFlags, unsigned batchN, const mtx::f_t& m ) - { - rc_t rc = kOkRC; - mtx::f_t* labelM = mtx::slice_alias(m,0,0,1); // the first row contains the labels - mtx::f_t* dsM = mtx::slice_alias(m,1,0); // all successive rows contain the data - mtx::f_t* oneHotM = mtx::alloc_one_hot(*labelM); // convert the labels to a one hot encoding - - //unsigned dsExampleN = mtx::ele_count(*labelM); // total count of examples in this dataset - - rc = _load( p, ssFlags, batchN, *dsM, *oneHotM ); - - // Inform the matrix objects that the ownership - // of the data and dimV memory from 'dsM' and 'oneHotM' - // has been taken over by the dataset object. - //clear_memory_release_flag( *oneHotM ); - //clear_memory_release_flag( *dsM ); - - mtx::release(labelM); - mtx::release(oneHotM); - mtx::release(dsM); - - return rc; - } - - rc_t _mnist_load( datasetMgr_t* p, const object_t* ele, unsigned batchN, unsigned flags ) - { - rc_t rc = kOkRC; - const char* inDir = nullptr; - mnist::handle_t mnistH; - - // locate - if( ele->get("inDir",inDir) != kOkRC ) - return cwLogError(kSyntaxErrorRC,"MNIST 'indir' cfg. 
label not found."); - - if( (rc = mnist::create(mnistH, inDir )) != kOkRC ) - { - return cwLogError(rc,"MNIST dataset instantiation failed."); + cwLogError(rc,"Batch read failed."); + goto errLabel; } else { - - const mtx::f_t* rM = mnist::train(mnistH); - const mtx::f_t* vM = mnist::validate(mnistH); - const mtx::f_t* tM = mnist::test(mnistH); - - - _mnist_load_subset( p, kTrainSsFl, batchN, *rM ); - _mnist_load_subset( p, kValidSsFl, batchN, *vM ); - _mnist_load_subset( p, kTestSsFl, batchN, *tM ); + const int* numbV = nullptr; + const unsigned* numbNV = nullptr; + const float* imagV = nullptr; + const unsigned* imagNV = nullptr; - mnist::destroy(mnistH); - } + adapter::get(rdrH, kNumbId, numbV, numbNV ); // get the labels + adapter::get(rdrH, kImagId, imagV, imagNV ); // get the image data - return rc; - } - } -} - -cw::rc_t cw::dataset::create( handle_t& h, const object_t* cfg, unsigned flags ) -{ - rc_t rc; - if((rc = destroy(h)) != kOkRC ) - return rc; - - datasetMgr_t* p = mem::allocZ(1); - - p->cfg = cfg; - p->typeFlag = flags; - h.set(p); - - return rc; -} - -cw::rc_t cw::dataset::destroy( handle_t& h ) -{ - rc_t rc = kOkRC; - - if( !h.isValid() ) - return kOkRC; - - datasetMgr_t* p = _handleToPtr(h); - - if((rc = _destroy(p)) != kOkRC ) - return rc; - - h.clear(); - - return rc; -} - -cw::rc_t cw::dataset::load( handle_t h, const char* dsLabel, unsigned batchN, unsigned validPct, unsigned testPct, unsigned flags ) -{ - rc_t rc = kOkRC; - datasetMgr_t* p = _handleToPtr(h); - const object_t* dataL = p->cfg->find("dataL"); - - // empty the data mgr x_dsA[] before loading the next dataset - _unload(p); - - - // for each possible dataset - for(unsigned i=0; ichild_count(); ++i) - { - const object_t* ele = dataL->child_ele(i); - const char* label = nullptr; - - // get the name of this dataset - if( ele->get("name", label ) != kOkRC ) - { - // all ele's must have a 'name' field - cwLogError(kLabelNotFoundRC,"Dataset cfg. 
element at index %i does not have a 'name' field.",i); - goto errLabel; - } - - // if this is the target dataset - if( strcmp(dsLabel,label) == 0 ) - { - if( strcmp(label,"mnist") == 0 ) - return _mnist_load(p, ele, batchN,flags); - } - - - } - - errLabel: - return rc; -} - - - -cw::rc_t cw::dataset::subset_dims( handle_t h, unsigned subsetFl, const unsigned*& dimV_Ref, unsigned& dimN_Ref ) -{ - datasetMgr_t* p = _handleToPtr(h); - unsigned ssIdx; - - if((ssIdx = _ssFlagToIndex(subsetFl)) == kInvalidIdx ) - return kInvalidArgRC; - - return _datasubset_data_dimV( p->ssA[ssIdx], dimV_Ref, dimN_Ref ); -} - -cw::rc_t cw::dataset::label_dims( handle_t h, unsigned subsetFl, const unsigned*& dimV_Ref, unsigned& dimN_Ref ) -{ - datasetMgr_t* p = _handleToPtr(h); - unsigned ssIdx; - - if((ssIdx = _ssFlagToIndex(subsetFl)) == kInvalidIdx ) - return kInvalidArgRC; - - return _datasubset_label_dimV( p->ssA[ssIdx], dimV_Ref, dimN_Ref ); -} - -cw::rc_t cw::dataset::batch_f( handle_t h, unsigned subsetFl, const float*& dataM_Ref, const float*& labelM_Ref ) -{ - datasetMgr_t* p = _handleToPtr(h); - unsigned ssIdx; - - if((ssIdx = _ssFlagToIndex(subsetFl)) == kInvalidIdx ) - return kInvalidArgRC; - - return _datasubset_batch_f( p->ssA[ssIdx], p->dataRealN, p->labelRealN, dataM_Ref, labelM_Ref ); -} - -cw::rc_t cw::dataset::batch_d( handle_t h, unsigned subsetFl, const double*& dataM_Ref, const double*& labelM_Ref ) -{ - datasetMgr_t* p = _handleToPtr(h); - unsigned ssIdx; - - if((ssIdx = _ssFlagToIndex(subsetFl)) == kInvalidIdx ) - return kInvalidArgRC; - - return _datasubset_batch_d( p->ssA[ssIdx], p->dataRealN, p->labelRealN, dataM_Ref, labelM_Ref ); -} - - - -cw::rc_t cw::dataset::test( const object_t* cfg ) -{ - handle_t h; - rc_t rc = kOkRC; - const char* dsLabel = nullptr; - unsigned batchN = 64; - unsigned validPct = 10; - unsigned testPct = 10; - unsigned typeFlag = kFloatFl; - time::spec_t t0; - const float* dataM = nullptr; - const float* labelM = nullptr; - const 
unsigned *dataDimV = nullptr; - const unsigned *labelDimV = nullptr; - unsigned dataDimN = 0; - unsigned labelDimN = 0; - unsigned batchCnt = 0; - time::get(t0); - - if((rc = cfg->getv("dsLabel",dsLabel,"batchN",batchN,"validPct",validPct,"testPct",testPct)) != kOkRC ) - return cwLogError(rc,"Dataset test failed. Argument parse failed."); - - if((rc = create(h,cfg,typeFlag)) != kOkRC ) - return cwLogError(rc,"Dataset manager create failed."); - - if((rc = load(h, dsLabel, batchN, validPct, testPct, kDoubleFl )) != kOkRC ) - { - cwLogError(rc,"'%s' dataset load failed.", cwStringNullGuard(dsLabel)); - goto errLabel; - } - - if((rc = subset_dims(h,kTrainSsFl,dataDimV, dataDimN )) != kOkRC ) - goto errLabel; - - if((rc = label_dims(h,kTrainSsFl,labelDimV, labelDimN )) != kOkRC ) - goto errLabel; - - vop::print(dataDimV,dataDimN,"%i ","data: "); - vop::print(labelDimV,labelDimN,"%i ","label: "); - - batchCnt = dataDimV[1]/batchN; - printf("batchCnt:%i\n",batchCnt); - - for(unsigned i=0; true; ++i ) - { - if((rc = batch_f(h,kTrainSsFl,dataM,labelM)) != kOkRC ) - { - printf("rc:%i : %i %i\n",rc,batchCnt,i); - break; - } - - if( i==0 ) - { - vop::print(dataM,3,"%f "); + for(unsigned i=0; i { } { } ... { } } + + Note that if a column's data has a fixed size then the is empty. + + */ + + namespace wtr + { + typedef handle handle_t; + + rc_t create( handle_t& h, const char* fn ); + rc_t destroy( handle_t& h ); + + // Define the shape of each column. Set variable length dimensions to 0. + rc_t define_columns( handle_t h, const char* label, unsigned columnId, unsigned rankN, const unsigned* dimV ); + + // Cache one column of data which will then be written on the call to write_record(). 
+ // If all the dimensions are defined in the column configuration then set dimV to nullptr; + rc_t write( handle_t h, unsigned columnId, const int* dV, unsigned dN, const unsigned* dimV=nullptr, unsigned dimN=0 ); + rc_t write( handle_t h, unsigned columnId, const float* dV, unsigned dN, const unsigned* dimV=nullptr, unsigned dimN=0 ); + rc_t write( handle_t h, unsigned columnId, const double* dV, unsigned dN, const unsigned* dimV=nullptr, unsigned dimN=0 ); + + // Write the + rc_t write_record( handle_t h ); + + rc_t test( const object_t* cfg ); + + } + + namespace rdr + { + typedef handle handle_t; + + enum + { + kIntRdrFl = 0x01, + kFloatRdrFl = 0x02, + kDoubleRdrFl = 0x04 + }; + + typedef struct col_str + { + const char* label; // Unique column label + unsigned id; // Unique column id + unsigned typeId; // See k???RdrFl type flags + unsigned varDimN; // Count of variable sized dimensions. 0 if this is a fixed size column. + unsigned rankN; // Count of elements in dimV[] + unsigned* dimV; // dimV[rankN]. Dimensions with value zero are undefined and set per field. + unsigned eleN; // Size of current column value + unsigned* maxDimV; // maxDimV[rankN]. Maximum value for each dimension. Same as dimV[] + + variant::value_t max; // Max value of all data elements in this field + variant::value_t min; // Min value of all data elements in this field + + unsigned maxEleN; // Max. count of elements in any one field. + unsigned maxByteN; // Max. size of this field in bytes + + unsigned byteOffset; // Byte offset of the value of this field in the current record buffer. + unsigned byteN; // Size of this field in bytes. 
+ } col_t; + + rc_t create( handle_t& h, const char* fn ); + rc_t destroy( handle_t& h ); + + unsigned column_count( handle_t h ); + const col_t* column_cfg( handle_t h, unsigned colIdx ); + const col_t* column_cfg( handle_t h, const char* colLabel ); + + unsigned record_count( handle_t h); + + unsigned cur_record_index( handle_t h ); + unsigned next_record_index( handle_t h ); + + enum { + kOkState, // Normal state + kErrorState, // An error has occurred which render the rdr unusable. + kEofState // The end of the file has been encountered. + }; + + unsigned state( handle_t h ); + + rc_t seek( handle_t h, unsigned recordIdx ); + + // Read the next record. + rc_t read( handle_t h, unsigned recordIdx=kInvalidIdx ); + + // Read a column value. + // + // vRef = Pointer to the value vector. + // nRef = Count of elements in value vector. + // dimVRef = Dimension vector. nRef = cumprod(dimVRef) + rc_t get( handle_t h, unsigned columnId, const int*& vRef, unsigned& nRef, const unsigned*& dimVRef ); + rc_t get( handle_t h, unsigned columnId, const float*& vRef, unsigned& nRef, const unsigned*& dimVRef ); + rc_t get( handle_t h, unsigned columnId, const double*& vRef, unsigned& nRef, const unsigned*& dimVRef ); + + rc_t report( handle_t h ); + + rc_t test( const object_t* cfg ); + } + + namespace adapter + { + typedef handle handle_t; + + enum { + kPreInitState, + kInitState, + kEofState, + kErrorState + }; + + enum { + + kTrackColDimFl = 0x01, + + kIntFl = 0x10, // Field Type Flags: int + kFloatFl = 0x20, // float + kDoubleFl = 0x40, // double + kTypeMask = 0x70 // (int | float | double) + }; + + typedef struct colMap_str + { + unsigned colId; // Column identifier from the rdr + unsigned fieldEleOffset; // Offset into field record of this column + unsigned eleN; // Count of elements in this column + const unsigned* dimV; // Shape of this column + unsigned rankN; // dimV[ rankN ] Rank of this column + } colMap_t; + + + rc_t create( handle_t& hRef, const char* fn, unsigned 
maxBatchN ); + rc_t destroy( handle_t& hRef ); + + // Create a field and assign it a column. + rc_t create_field( handle_t h, unsigned fieldId, unsigned flags, const char* colLabel=nullptr, bool oneHotFl=false ); + + // Assign an additional column to a field + rc_t assign_column( handle_t h, unsigned fieldId, const char* colLabel, bool oneHotFl=false ); + + // Total count of records in the dataset. + unsigned record_count( handle_t h ); + + // Field element count for fixed size fields. + unsigned field_fixed_ele_count( handle_t h, unsigned fieldId ); + + // Read and cache batchN records. + // recordIdxV[ batchN ] is an optional array of record indexes + rc_t read( handle_t h, unsigned batchN, const unsigned* recordIdxV=nullptr ); + + // Return field vectors formed on the previous call to read(). + // fV[ eleN, batchN ] + // fNV[ batchN ] = eleN for each column of vV[] + rc_t get( handle_t h, unsigned fieldId, const int*& fV_Ref, const unsigned*& fNV_Ref ); + rc_t get( handle_t h, unsigned fieldId, const float*& fV_Ref, const unsigned*& fNV_Ref ); + rc_t get( handle_t h, unsigned fieldId, const double*& fV_Ref, const unsigned*& fNV_Ref ); + + // Returns col position and geometry data from each record returned by the last + // call to read(). + // Returns colMapV_Ref[batchN][columnN]. + rc_t column_map( handle_t h, unsigned fieldId, colMap_t const * const *& colMapV_Ref ); + + // See k???State above for return values. + unsigned state( handle_t h ); + + // Print a field to stdout. If fmt==nullptr then a format is automatically set based on the data type. + rc_t print_field( handle_t h, unsigned fieldId, const char* fmt=nullptr ); + + rc_t test( const object_t* cfg ); + + } + + namespace mnist { typedef handle handle_t; - rc_t create( handle_t& h, const char* dir ); + rc_t create( handle_t& h, const char* inDir ); rc_t destroy( handle_t& h ); - // Each column has one example image. - // The top row contains the example label. 
- const mtx::f_t* train( handle_t h ); - const mtx::f_t* validate( handle_t h ); - const mtx::f_t* test( handle_t h ); + unsigned record_count( handle_t h ); + + rc_t seek( handle_t h, unsigned exampleIdx ); + rc_t dataM( handle_t h, const float*& dataM, const unsigned*& labelV, unsigned exampleN, unsigned& actualExampleN_Ref, unsigned exampleIdx=kInvalidIdx ); - rc_t test(const char* dir, const char* imageFn ); + rc_t write( handle_t h, const char* fn ); + + rc_t test( const object_t* cfg ); } - - - - - typedef handle handle_t; - - // Data subset flags - enum { kTrainSsFl=0x10, kValidSsFl=0x20, kTestSsFl=0x40 }; - - enum { kFloatFl=0x02, kDoubleFl=0x04 }; - rc_t create( handle_t& h, const object_t* cfg, unsigned flags ); - rc_t destroy( handle_t& h ); - - - // Load a dataset, divide it into train,validate, and test subsets - rc_t load( handle_t h, const char* dsLabel, unsigned batchN, unsigned validPct, unsigned testPct, unsigned flags ); - - // Shuffle the subset. - rc_t shuffle( handle_t h, unsigned subsetFl ); - - // Get the dimensions of all the examples from a subset. - // dimN=1: dimV[0]=batchN - // dimN=2: dimV[0]=realN dimV[1]=batchN - // dimN=3: dimV[0,1]=realN dimV[2]=batchN - rc_t subset_dims( handle_t h, unsigned subsetFl, const unsigned*& dimV_Ref, unsigned& dimN_Ref ); - rc_t label_dims( handle_t h, unsigned subsetFl, const unsigned*& dimV_Ref, unsigned& dimN_Ref ); - - - // get the next batch. Returns nullptr at the end of an epoch. - rc_t batch_f( handle_t h, unsigned subsetFl, const float*& dataM_Ref, const float*& labelM_Ref ); - rc_t batch_d( handle_t h, unsigned subsetFl, const double*& dataM_Ref, const double*& labelM_Ref ); - rc_t test( const object_t* cfg ); - + }