//| Copyright: (C) 2020-2024 Kevin Larke //| License: GNU GPL version 3.0 or above. See the accompanying LICENSE file. #include "cwCommon.h" #include "cwLog.h" #include "cwCommonImpl.h" #include "cwMem.h" #include "cwFile.h" #include "cwTest.h" #include "cwObject.h" #include "cwLex.h" namespace cw { namespace lex { enum { kRealFloatLexFl = 0x01, kIntUnsignedLexFl = 0x02 }; struct lex_str; typedef unsigned (*lexMatcherFuncPtr_t)( struct lex_str* p, const char* cp, unsigned cn, const char* keyStr ); // token match function record typedef struct { unsigned typeId; // token type this matcher recognizes lexMatcherFuncPtr_t funcPtr; // recognizer function (only used if userPtr==nullptr) char* tokenStr; // fixed string data used by the recognizer (only used if userPtr==nullptr) lexUserMatcherPtr_t userPtr; // user defined recognizer function (only used if funcPtr==nullptr) bool enableFl; // true if this matcher is enabled } lexMatcher; typedef struct lex_str { const char* cp; // character buffer unsigned cn; // count of characters in buffer unsigned ci; // current buffer index position unsigned flags; // lexer control flags unsigned curTokenId; // type id of the current token unsigned curTokenCharIdx; // index into cp[] of the current token unsigned curTokenCharCnt; // count of characters in the current token unsigned curLine; // line number of the current token unsigned curCol; // column number of the current token unsigned nextLine; unsigned nextCol; char* blockBegCmtStr; char* blockEndCmtStr; char* lineCmtStr; lexMatcher* mfp; // base of matcher array unsigned mfi; // next available matcher array slot unsigned mfn; // count of elementes in mfp[] char* textBuf; // text buf used by lexSetFile() unsigned attrFlags; // used to store the int and real suffix type flags unsigned lastRC; } lex_t; lex_t* _handleToPtr(handle_t h){ return handleToPtr(h); } bool _lexIsNewline( char c ) { return c == '\n'; } bool _lexIsCommentTypeId( unsigned typeId ) { return typeId == kBlockCmtLexTId || typeId == kLineCmtLexTId; } // Locate 'keyStr' in cp[cn] and return the index into cp[cn] of the character // following the last char in 'keyStr'. If keyStr is not found return kInvalidIdx. unsigned _lexScanTo( const char* cp, unsigned cn, const char* keyStr ) { unsigned i = 0; unsigned n = strlen(keyStr); if( n <= cn ) for(; i<=cn-n; ++i) if( strncmp(cp + i, keyStr, n ) == 0 ) return i+n; return kInvalidIdx; } unsigned _lexExactStringMatcher( lex_t* p, const char* cp, unsigned cn, const char* keyStr ) { unsigned n = strlen(keyStr); return strncmp(keyStr,cp,n) == 0 ? n : 0; } unsigned _lexSpaceMatcher( lex_t* p, const char* cp, unsigned cn, const char* keyStr ) { unsigned i=0; for(; i0 && i 0; } // if at least one digit was found if( d>0 ) { // Note that this path allows a string w/o a decimal pt to trigger a match. if(iattrFlags = cwSetFlag(p->attrFlags,kRealFloatLexFl); ++i; break; } } // match w/o suffix return if( d>0 && (fl || n==1 || cwIsFlag(p->attrFlags,kRealFloatLexFl)) ) return i; } return 0; // no-match return } unsigned _lexIntMatcher( lex_t* p, const char* cp, unsigned cn, const char* keyStr ) { unsigned i = 0; bool signFl = false; unsigned digitCnt = 0; for(; i= number of // digits following the decimal point (in effect zeros are // padded on the right side) then the value is an integer. // // The current implementation recognizes all numeric strings // containing a decimal point as reals. // if no integer was found if( digitCnt==0) return 0; // check for suffix if(icurLine); else { p->attrFlags = cwSetFlag(p->attrFlags,kIntUnsignedLexFl); ++i; } break; default: break; } } return i; } unsigned _lexHexMatcher( lex_t* p, const char* cp, unsigned cn, const char* keyStr ) { unsigned i = 0; if( cn < 3 ) return 0; if( cp[0]=='0' && cp[1]=='x') for(i=2; icurLine); } unsigned _lexQCharMatcher( lex_t* p, const char* cp, unsigned cn, const char* keyStr ) { unsigned i = 0; if( i >= cn || cp[i]!='\'' ) return 0; i+=2; if( i >= cn || cp[i]!='\'') return 0; return 3; } unsigned _lexBlockCmtMatcher( lex_t* p, const char* cp, unsigned cn, const char* keyStr ) { unsigned n = strlen(p->blockBegCmtStr); if( strncmp( p->blockBegCmtStr, cp, n ) == 0 ) { unsigned i; if((i = _lexScanTo(cp + n, cn-n,p->blockEndCmtStr)) == kInvalidIdx ) { cwLogError(kSyntaxErrorRC, "Missing end of block comment on line:%i.", p->curLine); return 0; } return n + i; } return 0; } unsigned _lexLineCmtMatcher( lex_t* p, const char* cp, unsigned cn, const char* keyStr ) { unsigned n = strlen(p->lineCmtStr); if( strncmp( p->lineCmtStr, cp, n ) == 0) { unsigned i; const char newlineStr[] = "\n"; if((i = _lexScanTo(cp + n, cn-n, newlineStr)) == kInvalidIdx ) { // no EOL was found so the comment must be on the last line of the source return cn; } return n + i; } return 0; } rc_t _lexInstallMatcher( lex_t* p, unsigned typeId, lexMatcherFuncPtr_t funcPtr, const char* keyStr, lexUserMatcherPtr_t userPtr ) { assert( funcPtr==nullptr || userPtr==nullptr ); assert( !(funcPtr==nullptr && userPtr==nullptr)); // if there is no space in the user token array - then expand it if( p->mfi == p->mfn ) { int incr_cnt = 10; lexMatcher* np = mem::allocZ( p->mfn + incr_cnt ); memcpy(np,p->mfp,p->mfi*sizeof(lexMatcher)); mem::release(p->mfp); p->mfp = np; p->mfn += incr_cnt; } p->mfp[p->mfi].tokenStr = nullptr; p->mfp[p->mfi].typeId = typeId; p->mfp[p->mfi].funcPtr = funcPtr; p->mfp[p->mfi].userPtr = userPtr; p->mfp[p->mfi].enableFl = true; if( keyStr != nullptr ) { // allocate space for the token string and store it p->mfp[p->mfi].tokenStr = mem::duplStr(keyStr); } p->mfi++; return kOkRC; } rc_t _lexReset( lex_t* p ) { p->ci = 0; p->curTokenId = kErrorLexTId; p->curTokenCharIdx = kInvalidIdx; p->curTokenCharCnt = 0; p->curLine = 0; p->curCol = 0; p->nextLine = 0; p->nextCol = 0; p->lastRC = kOkRC; return kOkRC; } rc_t _lexSetTextBuffer( lex_t* p, const char* cp, unsigned cn ) { p->cp = cp; p->cn = cn; return _lexReset(p); } lexMatcher* _lexFindUserToken( lex_t* p, unsigned id, const char* tokenStr ) { unsigned i = 0; for(; imfi; ++i) { if( id != kInvalidId && p->mfp[i].typeId == id ) return p->mfp + i; if( p->mfp[i].tokenStr != nullptr && tokenStr != nullptr && strcmp(p->mfp[i].tokenStr,tokenStr)==0 ) return p->mfp + i; } return nullptr; } } } // namespace cw cw::rc_t cw::lex::create( handle_t& hRef, const char* cp, unsigned cn, unsigned flags ) { rc_t rc = kOkRC; char dfltLineCmt[] = "//"; char dfltBlockBegCmt[] = "/*"; char dfltBlockEndCmt[] = "*/"; lex_t* p = nullptr; if((rc = lex::destroy(hRef)) != kOkRC ) return rc; p = mem::allocZ(); p->flags = flags; _lexSetTextBuffer( p, cp, cn ); int init_mfn = 10; p->mfp = mem::allocZ( init_mfn ); p->mfn = init_mfn; p->mfi = 0; p->lineCmtStr = mem::duplStr( dfltLineCmt ); p->blockBegCmtStr = mem::duplStr( dfltBlockBegCmt ); p->blockEndCmtStr = mem::duplStr( dfltBlockEndCmt ); _lexInstallMatcher( p, kSpaceLexTId, _lexSpaceMatcher, nullptr, nullptr ); _lexInstallMatcher( p, kRealLexTId, _lexRealMatcher, nullptr, nullptr ); _lexInstallMatcher( p, kIntLexTId, _lexIntMatcher, nullptr, nullptr ); _lexInstallMatcher( p, kHexLexTId, _lexHexMatcher, nullptr, nullptr ); _lexInstallMatcher( p, kIdentLexTId, _lexIdentMatcher, nullptr, nullptr ); _lexInstallMatcher( p, kQStrLexTId, _lexQStrMatcher, nullptr, nullptr ); _lexInstallMatcher( p, kBlockCmtLexTId, _lexBlockCmtMatcher, nullptr, nullptr ); _lexInstallMatcher( p, kLineCmtLexTId, _lexLineCmtMatcher, nullptr, nullptr ); if( cwIsFlag(flags,kReturnQCharLexFl) ) _lexInstallMatcher( p, kQCharLexTId, _lexQCharMatcher, nullptr, nullptr ); hRef.set(p); _lexReset(p); return rc; } cw::rc_t cw::lex::destroy( handle_t& hRef ) { if( hRef.isValid() == false ) return kOkRC; lex_t* p = _handleToPtr(hRef); if( p != nullptr ) { if( p->mfp != nullptr ) { unsigned i = 0; // free the user token strings for(; imfi; ++i) if( p->mfp[i].tokenStr != nullptr ) mem::release(p->mfp[i].tokenStr); // free the matcher array mem::release(p->mfp); p->mfi = 0; p->mfn = 0; } mem::release(p->lineCmtStr); mem::release(p->blockBegCmtStr); mem::release(p->blockEndCmtStr); mem::release(p->textBuf); // free the lexer object mem::release(p); hRef.set(nullptr); } return kOkRC; } cw::rc_t cw::lex::reset( handle_t h ) { lex_t* p = _handleToPtr(h); return _lexReset(p); } bool cw::lex::isValid( handle_t h ) { return h.isValid(); } cw::rc_t cw::lex::setTextBuffer( handle_t h, const char* cp, unsigned cn ) { lex_t* p = _handleToPtr(h); return _lexSetTextBuffer(p,cp,cn); } cw::rc_t cw::lex::setFile( handle_t h, const char* fn ) { rc_t rc = kOkRC; file::handle_t fh; lex_t* p = _handleToPtr(h); long n = 0; assert( fn != nullptr && p != nullptr ); // open the file if((rc = file::open(fh,fn,file::kReadFl)) != kOkRC ) return rc; // seek to the end of the file if((rc = file::seek(fh,file::kEndFl,0)) != kOkRC ) return rc; // get the length of the file if((rc = file::tell(fh,&n)) != kOkRC ) return rc; // rewind to the beginning of the file if((rc = file::seek(fh,file::kBeginFl,0)) != kOkRC ) return rc; // allocate the text buffer if((p->textBuf = mem::resizeZ(p->textBuf, n+1)) == nullptr ) { rc = cwLogError(kMemAllocFailRC,"Unable to allocate the text file buffer for:'%s'.",fn); goto errLabel; } // read the file into the buffer if((rc = file::read(fh,p->textBuf,n)) != kOkRC ) return rc; if((rc = _lexSetTextBuffer( p, p->textBuf, n )) != kOkRC ) goto errLabel; errLabel: // close the file rc_t rc0 = file::close(fh); if(rc != kOkRC ) return rc; return rc0; } cw::rc_t cw::lex::registerToken( handle_t h, unsigned id, const char* tokenStr ) { lex_t* p = _handleToPtr(h); // prevent duplicate tokens if( _lexFindUserToken( p, id, tokenStr ) != nullptr ) return cwLogError( kInvalidArgRC, "id:%i token:%s duplicates the token string or id", id, tokenStr ); return _lexInstallMatcher( p, id, _lexExactStringMatcher, tokenStr, nullptr ); } cw::rc_t cw::lex::registerMatcher( handle_t h, unsigned id, lexUserMatcherPtr_t userPtr ) { lex_t* p = _handleToPtr(h); // prevent duplicate tokens if( _lexFindUserToken( p, id, nullptr ) != nullptr ) return cwLogError(kInvalidArgRC, "A token matching function has already been installed for token id: %i", id ); return _lexInstallMatcher( p, id, nullptr, nullptr, userPtr ); } cw::rc_t cw::lex::enableToken( handle_t h, unsigned id, bool enableFl ) { lex_t* p = _handleToPtr(h); unsigned mi = 0; for(; mimfi; ++mi) if( p->mfp[mi].typeId == id ) { p->mfp[mi].enableFl = enableFl; return kOkRC; } return cwLogError( kInvalidArgRC, "%i is not a valid token type id.",id); } unsigned cw::lex::filterFlags( handle_t h ) { lex_t* p = _handleToPtr(h); return p->flags; } void cw::lex::setFilterFlags( handle_t h, unsigned flags ) { lex_t* p = _handleToPtr(h); p->flags = flags; } unsigned cw::lex::getNextToken( handle_t h ) { lex_t* p = _handleToPtr(h); if( p->lastRC != kOkRC ) return kErrorLexTId; while( p->ci < p->cn ) { unsigned i; unsigned mi = 0; unsigned maxCharCnt = 0; unsigned maxIdx = kInvalidIdx; p->curTokenId = kErrorLexTId; p->curTokenCharIdx = kInvalidIdx; p->curTokenCharCnt = 0; p->attrFlags = 0; // try each matcher for(; mimfi; ++mi) if( p->mfp[mi].enableFl ) { unsigned charCnt = 0; if( p->mfp[mi].funcPtr != nullptr ) charCnt = p->mfp[mi].funcPtr(p, p->cp + p->ci, p->cn - p->ci, p->mfp[mi].tokenStr ); else charCnt = p->mfp[mi].userPtr( p->cp + p->ci, p->cn - p->ci); // notice if the matcher set the error code if( p->lastRC != kOkRC ) return kErrorLexTId; // if this matched token is longer then the prev. matched token or // if the prev matched token was an identifier and this matched token is an equal length user defined token if( (charCnt > maxCharCnt) || (charCnt>0 && charCnt==maxCharCnt && p->mfp[maxIdx].typeId==kIdentLexTId && p->mfp[mi].typeId >=kUserLexTId ) || (charCnt>0 && charCntmfp[maxIdx].typeId==kIdentLexTId && p->mfp[mi].typeId >=kUserLexTId && cwIsFlag(p->flags,kUserDefPriorityLexFl)) ) { maxCharCnt = charCnt; maxIdx = mi; } } // no token was matched if( maxIdx == kInvalidIdx ) { if( cwIsFlag(p->flags,kReturnUnknownLexFl) ) { maxCharCnt = 1; } else { cwLogError( kSyntaxErrorRC, "Unable to recognize token:'%c' on line %i.",*(p->cp+p->ci), p->curLine); return kErrorLexTId; } } // update the current line and column position p->curLine = p->nextLine; p->curCol = p->nextCol; // find the next column and line position for(i=0; icp[ p->ci + i ]) ) { p->nextLine++; p->nextCol = 1; } else p->nextCol++; } bool returnFl = true; if( maxIdx != kInvalidIdx ) { // check the space token filter if( (p->mfp[ maxIdx ].typeId == kSpaceLexTId) && (cwIsFlag(p->flags,kReturnSpaceLexFl)==0) ) returnFl = false; // check the comment token filter if( _lexIsCommentTypeId(p->mfp[ maxIdx ].typeId) && (cwIsFlag(p->flags,kReturnCommentsLexFl)==0) ) returnFl = false; } // update the lexer state p->curTokenId = maxIdx==kInvalidIdx ? (unsigned)kUnknownLexTId : p->mfp[ maxIdx ].typeId; p->curTokenCharIdx = p->ci; p->curTokenCharCnt = maxCharCnt; // advance the text buffer p->ci += maxCharCnt; if( returnFl ) return p->curTokenId; } p->lastRC = kEofRC; return kEofLexTId; } unsigned cw::lex::tokenId( handle_t h ) { lex_t* p = _handleToPtr(h); return p->curTokenId; } const char* cw::lex::tokenText( handle_t h ) { lex_t* p = _handleToPtr(h); if( p->curTokenCharIdx == kInvalidIdx ) return nullptr; unsigned n = p->curTokenId == kQStrLexTId ? 1 : 0; return p->cp + p->curTokenCharIdx + n; } unsigned cw::lex::tokenCharCount( handle_t h ) { lex_t* p = _handleToPtr(h); if( p->curTokenCharIdx == kInvalidIdx ) return 0; unsigned n = p->curTokenId == kQStrLexTId ? 2 : 0; return p->curTokenCharCnt - n; } int cw::lex::tokenInt( handle_t h ) { return strtol( lex::tokenText(h),nullptr,0 ); } unsigned cw::lex::tokenUInt( handle_t h ) { return strtol( lex::tokenText(h),nullptr,0 ); } float cw::lex::tokenFloat( handle_t h ) { return strtof( lex::tokenText(h),nullptr ); } double cw::lex::tokenDouble( handle_t h ) { return strtod( lex::tokenText(h),nullptr ); } bool cw::lex::tokenIsUnsigned( handle_t h ) { lex_t* p = _handleToPtr(h); return p->curTokenId == kIntLexTId && cwIsFlag(p->attrFlags,kIntUnsignedLexFl); } bool cw::lex::tokenIsSinglePrecision( handle_t h ) { lex_t* p = _handleToPtr(h); return p->curTokenId == kRealLexTId && cwIsFlag(p->attrFlags,kRealFloatLexFl); } unsigned cw::lex::currentLineNumber( handle_t h ) { lex_t* p = _handleToPtr(h); return p->curLine + 1; } unsigned cw::lex::currentColumnNumber( handle_t h ) { lex_t* p = _handleToPtr(h); return p->curCol + 1; } unsigned cw::lex::errorRC( handle_t h ) { lex_t* p = _handleToPtr(h); return p->lastRC; } const char* cw::lex::idToLabel( handle_t h, unsigned typeId ) { lex_t* p = _handleToPtr(h); switch( typeId ) { case kErrorLexTId: return ""; case kEofLexTId: return ""; case kSpaceLexTId: return ""; case kRealLexTId: return ""; case kIntLexTId: return ""; case kHexLexTId: return ""; case kIdentLexTId: return ""; case kQStrLexTId: return ""; case kBlockCmtLexTId: return ""; case kLineCmtLexTId: return ""; default: { lexMatcher* mp; if((mp = _lexFindUserToken(p,typeId,nullptr)) == nullptr ) return ""; return mp->tokenStr; } } return ""; } namespace cw { namespace lex { //{ { label:cwLexEx } //( // lexTest() gives a simple 'lex' example. //) //( rc_t test( const test::test_args_t& args ) { rc_t rc = kOkRC; unsigned tid = kInvalidId; handle_t h; char buf[] = "123ident0\n 123.456\nident0\n" "0xa12+.2\n" " // comment \n" "/* block \n" "comment */" "\"quoted string\"" "ident1 " "1234.56f" "345u" " // last line comment"; // initialize a lexer with a buffer of text if((rc = lex::create(h,buf,strlen(buf), kReturnSpaceLexFl | kReturnCommentsLexFl)) != kOkRC ) { rc = cwLogError(rc,"Lexer initialization failed."); goto errLabel; } // register some additional recoginizers lex::registerToken(h,kUserLexTId+1,"+"); lex::registerToken(h,kUserLexTId+2,"-"); // ask for token id's while( (tid = lex::getNextToken(h)) != kEofLexTId ) { // print information about each token cwLogInfo("ln:%i col:%i tok:%s '%.*s' len:%i ", lex::currentLineNumber(h), lex::currentColumnNumber(h), lex::idToLabel(h,tid), lex::tokenCharCount(h), lex::tokenText(h) , lex::tokenCharCount(h)); // if the token is a number ... if( tid==kIntLexTId || tid==kRealLexTId || tid==kHexLexTId ) { // ... then request the numbers value int iv = lex::tokenInt(h); double dv = lex::tokenDouble(h); cwLogInfo("Number: int:%i dbl:%f unsigned:%i float:%i",iv,dv,tokenIsUnsigned(h),tokenIsSinglePrecision(h)); } // handle errors if( tid == kErrorLexTId ) { cwLogInfo("Error:%i\n", lex::errorRC(h)); break; } } errLabel: // finalize the lexer lex::destroy(h); return rc; } } } //) //}