#include "cwCommon.h"
#include "cwLog.h"
#include "cwCommonImpl.h"
#include "cwMem.h"
#include "cwFile.h"
#include "cwLex.h"

// NOTE(review): The copy of this file that was reviewed had been collapsed onto
// a handful of physical lines and had lost every '<...>' span (template
// arguments, many 'i<cn' loop headers, parts of several matcher bodies and the
// "<...>" label strings).  Those spans were reconstructed; each reconstructed
// region is marked with NOTE(review) and should be diffed against the upstream
// source before this file is merged.

namespace cw
{
  // Attribute flags stored in lex_t.attrFlags by the numeric matchers.
  enum
  {
    kRealFloatLexFl   = 0x01,  // the real token carried an 'f'/'F' suffix
    kIntUnsignedLexFl = 0x02   // the int token carried a 'u'/'U' suffix
  };

  struct lex_str;

  // Built-in matcher signature.  cp[cn] is the unread part of the text buffer.
  // Returns the count of characters matched at cp[0], or 0 for no match.
  typedef unsigned (*lexMatcherFuncPtr_t)( struct lex_str* p, const char* cp, unsigned cn, const char* keyStr );

  // token match function record
  typedef struct
  {
    unsigned            typeId;    // token type this matcher recognizes
    lexMatcherFuncPtr_t funcPtr;   // recognizer function (only used if userPtr==nullptr)
    char*               tokenStr;  // fixed string data used by the recognizer (only used if userPtr==nullptr)
    lexUserMatcherPtr_t userPtr;   // user defined recognizer function (only used if funcPtr==nullptr)
    bool                enableFl;  // true if this matcher is enabled
  } lexMatcher;

  typedef struct lex_str
  {
    const char* cp;               // character buffer
    unsigned    cn;               // count of characters in buffer
    unsigned    ci;               // current buffer index position
    unsigned    flags;            // lexer control flags
    unsigned    curTokenId;       // type id of the current token
    unsigned    curTokenCharIdx;  // index into cp[] of the current token
    unsigned    curTokenCharCnt;  // count of characters in the current token
    unsigned    curLine;          // line number of the current token
    unsigned    curCol;           // column number of the current token

    unsigned    nextLine;         // line number of the character at cp[ci]
    unsigned    nextCol;          // column number of the character at cp[ci]

    char*       blockBegCmtStr;   // string which begins a block comment
    char*       blockEndCmtStr;   // string which ends a block comment
    char*       lineCmtStr;       // string which begins a line comment

    lexMatcher* mfp;              // base of matcher array
    unsigned    mfi;              // next available matcher array slot
    unsigned    mfn;              // count of elements in mfp[]

    char*       textBuf;          // text buf used by lexSetFile()

    unsigned    attrFlags;        // used to store the int and real suffix type flags

    unsigned    lastRC;           // sticky error/EOF code; while != kOkRC lexGetNextToken() returns kErrorLexTId
  } lex_t;

  lexH_t lexNullHandle;

  // NOTE(review): the template arguments were lost from the reviewed copy;
  // <lexH_t,lex_t> is a reconstruction.
#define _lexHandleToPtr(h) handleToPtr<lexH_t,lex_t>(h)

  bool _lexIsNewline( char c )
  { return c == '\n'; }

  bool _lexIsCommentTypeId( unsigned typeId )
  { return typeId == kBlockCmtLexTId || typeId == kLineCmtLexTId; }

  // Locate 'keyStr' in cp[cn] and return the index into cp[cn] of the character
  // following the last char in 'keyStr'.  If keyStr is not found return kInvalidIdx.
  unsigned _lexScanTo( const char* cp, unsigned cn, const char* keyStr )
  {
    unsigned i = 0;
    unsigned n = strlen(keyStr);

    if( n <= cn )
      for(; i<=cn-n; ++i)
        if( strncmp(cp + i, keyStr, n ) == 0 )
          return i+n;

    return kInvalidIdx;
  }

  // Match the fixed string 'keyStr' at cp[0].
  // Returns strlen(keyStr) on a match and 0 otherwise.
  unsigned _lexExactStringMatcher( lex_t* p, const char* cp, unsigned cn, const char* keyStr )
  {
    unsigned n = strlen(keyStr);

    // n<=cn keeps strncmp() from reading past the end of a buffer
    // which is not zero terminated.
    return (n <= cn && strncmp(keyStr,cp,n) == 0) ? n : 0;
  }

  // Match a run of white space characters.
  // NOTE(review): body reconstructed - lost from the reviewed copy.
  unsigned _lexSpaceMatcher( lex_t* p, const char* cp, unsigned cn, const char* keyStr )
  {
    unsigned i=0;
    for(; i<cn; ++i)
      if( !isspace(static_cast<unsigned char>(cp[i])) )
        break;
    return i;
  }

  // Match a real number: optional leading '-', digits with at most one decimal
  // point, an optional exponent, and an optional 'f'/'F' suffix (which sets
  // kRealFloatLexFl).  A digit string with no decimal point, no exponent and
  // no suffix is not matched here.
  // NOTE(review): everything up to the exponent test ('fl = e > 0') and the
  // suffix switch header were lost from the reviewed copy and are reconstructed.
  unsigned _lexRealMatcher( lex_t* p, const char* cp, unsigned cn, const char* keyStr )
  {
    unsigned i  = 0;
    unsigned n  = 0;      // decimal point counter
    unsigned d  = 0;      // digit counter
    bool     fl = false;  // true if a valid exponent was found

    for(; i<cn && n<=1; ++i)
    {
      if( i==0 && cp[i]=='-' )  // allow a leading '-'
        continue;

      if( isdigit(static_cast<unsigned char>(cp[i])) )
      {
        ++d;
        continue;
      }

      if( cp[i] == '.' && n==0 )  // allow exactly one decimal point
        ++n;
      else
        break;
    }

    // if there was at least one digit and the next char is an 'e'
    if( d>0 && i<cn && (cp[i] == 'e' || cp[i] == 'E') )
    {
      unsigned e = 0;   // count of exponent digits
      ++i;
      unsigned j = i;   // index of the first char following the 'e'

      for(; i<cn; ++i)
      {
        if( i==j && cp[i]=='-' )  // allow the char following the 'e' to be '-'
          continue;

        if( isdigit(static_cast<unsigned char>(cp[i])) )
        {
          ++e;
          ++d;
          continue;
        }

        // stop at the first non-digit
        break;
      }

      // an exponent exists if digits followed the 'e'
      fl = e > 0;
    }

    // if at least one digit was found
    if( d>0 )
    {
      // Note that this path allows a string w/o a decimal pt to trigger a match.
      if(i<cn)
      {
        // if the real has a suffix
        switch(cp[i])
        {
          case 'F':
          case 'f':
            p->attrFlags = cwSetFlag(p->attrFlags,kRealFloatLexFl);
            ++i;
            break;
        }
      }

      // match w/o suffix return
      if( d>0 && (fl || n==1 || cwIsFlag(p->attrFlags,kRealFloatLexFl)) )
        return i;
    }

    return 0; // no-match return
  }

  // Match an integer: optional leading '-', one or more digits, and an
  // optional 'u'/'U' suffix (which sets kIntUnsignedLexFl).
  // NOTE(review): the digit loop, the head of the comment below, and the
  // suffix switch header (including the log message text) were lost from the
  // reviewed copy and are reconstructed.
  unsigned _lexIntMatcher( lex_t* p, const char* cp, unsigned cn, const char* keyStr )
  {
    unsigned i        = 0;
    bool     signFl   = false;
    unsigned digitCnt = 0;

    for(; i<cn; ++i)
    {
      if( i==0 && cp[i]=='-' )
      {
        signFl = true;
        continue;
      }

      if( !isdigit(static_cast<unsigned char>(cp[i])) )
        break;

      ++digitCnt;
    }

    // BUG BUG BUG
    // If an integer is specified using 'e' notation
    // (see _lexRealMatcher()) and the number of exponent places
    // specified following the 'e' is positive and >= number of
    // digits following the decimal point (in effect zeros are
    // padded on the right side) then the value is an integer.
    //
    // The current implementation recognizes all numeric strings
    // containing a decimal point as reals.

    // if no integer was found
    if( digitCnt==0)
      return 0;

    // check for suffix
    if(i<cn )
    {
      switch(cp[i])
      {
        case 'u':
        case 'U':
          if( signFl )
            cwLogError(kSyntaxErrorRC,"A signed integer has a 'unsigned' suffix on line %i.",p->curLine);
          else
          {
            p->attrFlags = cwSetFlag(p->attrFlags,kIntUnsignedLexFl);
            ++i;
          }
          break;

        default:
          break;
      }
    }

    return i;
  }

  // Match a hex literal: "0x" followed by hex digits.
  // NOTE(review): loop body reconstructed - lost from the reviewed copy.
  unsigned _lexHexMatcher( lex_t* p, const char* cp, unsigned cn, const char* keyStr )
  {
    unsigned i = 0;

    if( cn < 3 )
      return 0;

    if( cp[0]=='0' && cp[1]=='x')
      for(i=2; i<cn; ++i)
        if( !isxdigit(static_cast<unsigned char>(cp[i])) )
          break;

    return i;
  }

  // Match an identifier: a letter or '_' followed by letters, digits or '_'.
  // NOTE(review): entire function reconstructed - lost from the reviewed copy.
  unsigned _lexIdentMatcher( lex_t* p, const char* cp, unsigned cn, const char* keyStr )
  {
    unsigned i = 0;

    if( isalpha(static_cast<unsigned char>(cp[0])) || (cp[0]== '_'))
    {
      i = 1;
      for(; i<cn; ++i)
        if( !isalnum(static_cast<unsigned char>(cp[i])) && (cp[i] != '_') )
          break;
    }

    return i;
  }

  // Match a double quoted string literal.  A backslash escapes the character
  // which follows it.  The returned count includes both quote characters.
  // NOTE(review): reconstructed - only the tail of the error call survived in
  // the reviewed copy.  That tail suggests the function ended by returning the
  // logger's result code directly, which would hand an error code back to
  // lexGetNextToken() as a character count.  Here the error is latched in
  // p->lastRC (which lexGetNextToken() checks after every matcher) and the
  // function reports 'no match'.
  unsigned _lexQStrMatcher( lex_t* p, const char* cp, unsigned cn, const char* keyStr )
  {
    bool     escFl = false;
    unsigned i     = 0;

    if( cp[i] != '"' )
      return 0;

    for(i=1; i<cn; ++i)
    {
      if( escFl )
      {
        escFl = false;
        continue;
      }

      if( cp[i] == '\\' )
      {
        escFl = true;
        continue;
      }

      if( cp[i] == '"' )
        return i+1;
    }

    p->lastRC = cwLogError(kSyntaxErrorRC, "Missing string literal end quote on line: %i.", p->curLine);
    return 0;
  }

  // Match a three character quoted char literal of the form 'x'.
  unsigned _lexQCharMatcher( lex_t* p, const char* cp, unsigned cn, const char* keyStr )
  {
    unsigned i = 0;

    if( i >= cn || cp[i]!='\'' )
      return 0;

    i+=2;

    if( i >= cn || cp[i]!='\'')
      return 0;

    return 3;
  }

  // Match a block comment delimited by p->blockBegCmtStr / p->blockEndCmtStr.
  // Logs an error and returns 0 if the end delimiter is missing.
  unsigned _lexBlockCmtMatcher( lex_t* p, const char* cp, unsigned cn, const char* keyStr )
  {
    unsigned n = strlen(p->blockBegCmtStr);

    // n<=cn keeps strncmp() inside cp[cn] and keeps 'cn-n' from wrapping
    if( n <= cn && strncmp( p->blockBegCmtStr, cp, n ) == 0 )
    {
      unsigned i;
      if((i = _lexScanTo(cp + n, cn-n,p->blockEndCmtStr)) == kInvalidIdx )
      {
        cwLogError(kSyntaxErrorRC, "Missing end of block comment on line:%i.", p->curLine);
        return 0;
      }

      return n + i;
    }
    return 0;
  }

  // Match a line comment which begins with p->lineCmtStr and runs through the
  // next newline (or to the end of the buffer if there is no newline).
  unsigned _lexLineCmtMatcher( lex_t* p, const char* cp, unsigned cn, const char* keyStr )
  {
    unsigned n = strlen(p->lineCmtStr);

    // n<=cn keeps strncmp() inside cp[cn] and keeps 'cn-n' from wrapping
    if( n <= cn && strncmp( p->lineCmtStr, cp, n ) == 0)
    {
      unsigned   i;
      const char newlineStr[] = "\n";

      if((i = _lexScanTo(cp + n, cn-n, newlineStr)) == kInvalidIdx )
      {
        // no EOL was found so the comment must be on the last line of the source
        return cn;
      }

      return n + i;
    }
    return 0;
  }

  // Append a matcher record to p->mfp[], growing the array when it is full.
  // Exactly one of 'funcPtr' and 'userPtr' must be non-null.  'keyStr', if
  // given, is duplicated and owned by the matcher record.
  rc_t _lexInstallMatcher( lex_t* p, unsigned typeId, lexMatcherFuncPtr_t funcPtr, const char* keyStr, lexUserMatcherPtr_t userPtr )
  {
    assert( funcPtr==nullptr || userPtr==nullptr );
    assert( !(funcPtr==nullptr && userPtr==nullptr));

    // if there is no space in the user token array - then expand it
    if( p->mfi == p->mfn )
    {
      int         incr_cnt = 10;
      // NOTE(review): template argument reconstructed.
      lexMatcher* np       = memAllocZ<lexMatcher>( p->mfn + incr_cnt );
      memcpy(np,p->mfp,p->mfi*sizeof(lexMatcher));
      memRelease(p->mfp);
      p->mfp  = np;
      p->mfn += incr_cnt;
    }

    p->mfp[p->mfi].tokenStr = nullptr;
    p->mfp[p->mfi].typeId   = typeId;
    p->mfp[p->mfi].funcPtr  = funcPtr;
    p->mfp[p->mfi].userPtr  = userPtr;
    p->mfp[p->mfi].enableFl = true;

    if( keyStr != nullptr )
    {
      // allocate space for the token string and store it
      p->mfp[p->mfi].tokenStr = memDuplStr(keyStr);
    }

    p->mfi++;
    return kOkRC;
  }

  // Rewind the lexer to the start of the current text buffer and clear the
  // current token and the sticky error code.
  rc_t _lexReset( lex_t* p )
  {
    p->ci              = 0;

    p->curTokenId      = kErrorLexTId;
    p->curTokenCharIdx = kInvalidIdx;
    p->curTokenCharCnt = 0;

    p->curLine         = 0;
    p->curCol          = 0;
    p->nextLine        = 0;
    p->nextCol         = 0;

    p->lastRC          = kOkRC;

    return kOkRC;
  }

  // Point the lexer at cp[cn] (not copied) and reset the scan position.
  rc_t _lexSetTextBuffer( lex_t* p, const char* cp, unsigned cn )
  {
    p->cp = cp;
    p->cn = cn;

    return _lexReset(p);
  }

  // Return the first installed matcher whose type id equals 'id' (when
  // id!=kInvalidId) or whose token string equals 'tokenStr' (when both are
  // non-null).  Returns nullptr if there is no such matcher.
  lexMatcher* _lexFindUserToken( lex_t* p, unsigned id, const char* tokenStr )
  {
    unsigned i = 0;
    for(; i<p->mfi; ++i)
    {
      if( id != kInvalidId && p->mfp[i].typeId == id )
        return p->mfp + i;

      if( p->mfp[i].tokenStr != nullptr && tokenStr != nullptr && strcmp(p->mfp[i].tokenStr,tokenStr)==0 )
        return p->mfp + i;
    }

    return nullptr;
  }

} // namespace cw


// Create a lexer on the text buffer cp[cn] (which is referenced, not copied)
// and install the built-in matchers.  Any lexer already held by 'hRef' is
// destroyed first.
cw::rc_t cw::lexCreate( lexH_t& hRef, const char* cp, unsigned cn, unsigned flags )
{
  rc_t   rc                = kOkRC;
  char   dfltLineCmt[]     = "//";
  char   dfltBlockBegCmt[] = "/*";
  char   dfltBlockEndCmt[] = "*/";
  lex_t* p                 = nullptr;

  if((rc = lexDestroy(hRef)) != kOkRC )
    return rc;

  // NOTE(review): template arguments on memAllocZ reconstructed.
  p = memAllocZ<lex_t>();

  p->flags = flags;

  _lexSetTextBuffer( p, cp, cn );

  int init_mfn = 10;
  p->mfp = memAllocZ<lexMatcher>( init_mfn );
  p->mfn = init_mfn;
  p->mfi = 0;

  p->lineCmtStr     = memDuplStr( dfltLineCmt );
  p->blockBegCmtStr = memDuplStr( dfltBlockBegCmt );
  p->blockEndCmtStr = memDuplStr( dfltBlockEndCmt );

  _lexInstallMatcher( p, kSpaceLexTId,    _lexSpaceMatcher,    nullptr, nullptr );
  _lexInstallMatcher( p, kRealLexTId,     _lexRealMatcher,     nullptr, nullptr );
  _lexInstallMatcher( p, kIntLexTId,      _lexIntMatcher,      nullptr, nullptr );
  _lexInstallMatcher( p, kHexLexTId,      _lexHexMatcher,      nullptr, nullptr );
  _lexInstallMatcher( p, kIdentLexTId,    _lexIdentMatcher,    nullptr, nullptr );
  _lexInstallMatcher( p, kQStrLexTId,     _lexQStrMatcher,     nullptr, nullptr );
  _lexInstallMatcher( p, kBlockCmtLexTId, _lexBlockCmtMatcher, nullptr, nullptr );
  _lexInstallMatcher( p, kLineCmtLexTId,  _lexLineCmtMatcher,  nullptr, nullptr );

  // the quoted char matcher is only installed on request
  if( cwIsFlag(flags,kReturnQCharLexFl) )
    _lexInstallMatcher( p, kQCharLexTId, _lexQCharMatcher, nullptr, nullptr );

  hRef.set(p);

  _lexReset(p);

  return rc;
}

// Release all memory owned by the lexer and invalidate the handle.
// Calling this on an invalid handle is a no-op.
cw::rc_t cw::lexDestroy( lexH_t& hRef )
{
  if( hRef.isValid() == false )
    return kOkRC;

  lex_t* p = _lexHandleToPtr(hRef);

  if( p != nullptr )
  {
    if( p->mfp != nullptr )
    {
      unsigned i = 0;

      // free the user token strings
      for(; i<p->mfi; ++i)
        if( p->mfp[i].tokenStr != nullptr )
          memRelease(p->mfp[i].tokenStr);

      // free the matcher array
      memRelease(p->mfp);
      p->mfi = 0;
      p->mfn = 0;
    }

    memRelease(p->lineCmtStr);
    memRelease(p->blockBegCmtStr);
    memRelease(p->blockEndCmtStr);
    memRelease(p->textBuf);

    // free the lexer object
    memRelease(p);
    hRef.set(nullptr);
  }

  return kOkRC;
}

// Rewind the lexer to the beginning of its current text buffer.
cw::rc_t cw::lexReset( lexH_t h )
{
  lex_t* p = _lexHandleToPtr(h);
  return _lexReset(p);
}

bool cw::lexIsValid( lexH_t h )
{ return h.isValid(); }

// Replace the text buffer with cp[cn] (referenced, not copied) and rewind.
cw::rc_t cw::lexSetTextBuffer( lexH_t h, const char* cp, unsigned cn )
{
  lex_t* p = _lexHandleToPtr(h);
  return _lexSetTextBuffer(p,cp,cn);
}

// Read the file 'fn' into an internal buffer (p->textBuf, released by
// lexDestroy()) and make that buffer the current text buffer.
// Once the file has been opened every exit path closes it.
cw::rc_t cw::lexSetFile( lexH_t h, const char* fn )
{
  rc_t    rc  = kOkRC;
  rc_t    rc0 = kOkRC;
  fileH_t fh;
  lex_t*  p   = _lexHandleToPtr(h);
  long    n   = 0;

  assert( fn != nullptr && p != nullptr );

  // open the file (nothing to close if this fails)
  if((rc = fileOpen(fh,fn,kReadFileFl)) != kOkRC )
    return rc;

  // seek to the end of the file
  if((rc = fileSeek(fh,kEndFileFl,0)) != kOkRC )
    goto errLabel;

  // get the length of the file
  if((rc = fileTell(fh,&n)) != kOkRC )
    goto errLabel;

  // rewind to the beginning of the file
  if((rc = fileSeek(fh,kBeginFileFl,0)) != kOkRC )
    goto errLabel;

  // allocate the text buffer
  // NOTE(review): template argument on memResizeZ reconstructed.
  if((p->textBuf = memResizeZ<char>(p->textBuf, n+1)) == nullptr )
  {
    rc = cwLogError(kMemAllocFailRC,"Unable to allocate the text file buffer for:'%s'.",fn);
    goto errLabel;
  }

  // read the file into the buffer
  if((rc = fileRead(fh,p->textBuf,n)) != kOkRC )
    goto errLabel;

  rc = _lexSetTextBuffer( p, p->textBuf, n );

 errLabel:
  // close the file - an earlier error takes precedence over a close error
  rc0 = fileClose(fh);

  return rc != kOkRC ? rc : rc0;
}

// Register the fixed string 'tokenStr' as a token with type id 'id'.
// Fails if the id or the string is already registered.
cw::rc_t cw::lexRegisterToken( lexH_t h, unsigned id, const char* tokenStr )
{
  lex_t* p = _lexHandleToPtr(h);

  // an exact string matcher needs a non-empty string to compare against
  if( tokenStr == nullptr || tokenStr[0] == 0 )
    return cwLogError( kInvalidArgRC, "The token string for id:%i is empty.", id );

  // prevent duplicate tokens
  if( _lexFindUserToken( p, id, tokenStr ) != nullptr )
    return cwLogError( kInvalidArgRC, "id:%i token:%s duplicates the token string or id", id, tokenStr );

  return _lexInstallMatcher( p, id, _lexExactStringMatcher, tokenStr, nullptr );
}

// Register a user defined matcher function for the token type id 'id'.
// Fails if the id is already registered.
cw::rc_t cw::lexRegisterMatcher( lexH_t h, unsigned id, lexUserMatcherPtr_t userPtr )
{
  lex_t* p = _lexHandleToPtr(h);

  // _lexInstallMatcher() requires a function to call
  if( userPtr == nullptr )
    return cwLogError(kInvalidArgRC, "The token matching function for token id: %i is null.", id );

  // prevent duplicate tokens
  if( _lexFindUserToken( p, id, nullptr ) != nullptr )
    return cwLogError(kInvalidArgRC, "A token matching function has already been installed for token id: %i", id );

  return _lexInstallMatcher( p, id, nullptr, nullptr, userPtr );
}

// Enable or disable the first matcher whose type id is 'id'.
cw::rc_t cw::lexEnableToken( lexH_t h, unsigned id, bool enableFl )
{
  lex_t* p = _lexHandleToPtr(h);

  unsigned mi = 0;
  for(; mi<p->mfi; ++mi)
    if( p->mfp[mi].typeId == id )
    {
      p->mfp[mi].enableFl = enableFl;
      return kOkRC;
    }

  return cwLogError( kInvalidArgRC, "%i is not a valid token type id.",id);
}

unsigned cw::lexFilterFlags( lexH_t h )
{
  lex_t* p = _lexHandleToPtr(h);
  return p->flags;
}

void cw::lexSetFilterFlags( lexH_t h, unsigned flags )
{
  lex_t* p = _lexHandleToPtr(h);
  p->flags = flags;
}

// Advance to the next token and return its type id.
// Space and comment tokens are skipped unless kReturnSpaceLexFl /
// kReturnCommentsLexFl are set.  Returns kEofLexTId at the end of the buffer
// and kErrorLexTId on error; after either, the sticky code returned by
// lexErrorRC() is non-zero and every later call returns kErrorLexTId until
// the lexer is reset.
unsigned cw::lexGetNextToken( lexH_t h )
{
  lex_t* p = _lexHandleToPtr(h);

  if( p->lastRC != kOkRC )
    return kErrorLexTId;

  while( p->ci < p->cn )
  {
    unsigned i;
    unsigned mi         = 0;
    unsigned maxCharCnt = 0;            // length of the best match so far
    unsigned maxIdx     = kInvalidIdx;  // index into mfp[] of the best match so far

    p->curTokenId      = kErrorLexTId;
    p->curTokenCharIdx = kInvalidIdx;
    p->curTokenCharCnt = 0;
    p->attrFlags       = 0;

    // try each matcher
    for(; mi<p->mfi; ++mi)
      if( p->mfp[mi].enableFl )
      {
        unsigned charCnt = 0;
        if( p->mfp[mi].funcPtr != nullptr )
          charCnt = p->mfp[mi].funcPtr(p, p->cp + p->ci, p->cn - p->ci, p->mfp[mi].tokenStr );
        else
          charCnt = p->mfp[mi].userPtr( p->cp + p->ci, p->cn - p->ci);

        // notice if the matcher set the error code
        if( p->lastRC != kOkRC )
          return kErrorLexTId;

        // Select this matcher if:
        // 1) this matched token is longer than the prev. matched token, or
        // 2) the prev matched token was an identifier and this matched token
        //    is an equal length user defined token, or
        // 3) kUserDefPriorityLexFl is set, the prev matched token was an
        //    identifier and this is a shorter user defined token.
        // NOTE(review): the middle of clause 3 ('charCnt<maxCharCnt && p->')
        // was lost from the reviewed copy and is reconstructed.
        // In 2) and 3) maxCharCnt>0, so maxIdx is always a valid index.
        if( (charCnt > maxCharCnt)
          || (charCnt>0 && charCnt==maxCharCnt && p->mfp[maxIdx].typeId==kIdentLexTId && p->mfp[mi].typeId >=kUserLexTId )
          || (charCnt>0 && charCnt<maxCharCnt  && p->mfp[maxIdx].typeId==kIdentLexTId && p->mfp[mi].typeId >=kUserLexTId && cwIsFlag(p->flags,kUserDefPriorityLexFl))
          )
        {
          maxCharCnt = charCnt;
          maxIdx     = mi;
        }
      }

    // no token was matched
    if( maxIdx == kInvalidIdx )
    {
      if( cwIsFlag(p->flags,kReturnUnknownLexFl) )
      {
        // return the unmatched character as a one character 'unknown' token
        maxCharCnt = 1;
      }
      else
      {
        // Latch the error.  The scan position is not advanced here, so
        // without the latch every later call would re-scan and re-log
        // the same character.
        p->lastRC = cwLogError( kSyntaxErrorRC, "Unable to recognize token:'%c' on line %i.",*(p->cp+p->ci), p->curLine);
        return kErrorLexTId;
      }
    }

    // update the current line and column position
    p->curLine = p->nextLine;
    p->curCol  = p->nextCol;

    // find the next column and line position
    // NOTE(review): loop header reconstructed.
    for(i=0; i<maxCharCnt; ++i)
    {
      if( _lexIsNewline(p->cp[ p->ci + i ]) )
      {
        p->nextLine++;
        // Columns are stored zero based (_lexReset() starts at 0 and
        // lexCurrentColumnNumber() adds 1) so a new line restarts at 0.
        p->nextCol = 0;
      }
      else
        p->nextCol++;
    }

    bool returnFl = true;

    if( maxIdx != kInvalidIdx )
    {
      // check the space token filter
      if( (p->mfp[ maxIdx ].typeId == kSpaceLexTId) && (cwIsFlag(p->flags,kReturnSpaceLexFl)==0) )
        returnFl = false;

      // check the comment token filter
      if( _lexIsCommentTypeId(p->mfp[ maxIdx ].typeId) && (cwIsFlag(p->flags,kReturnCommentsLexFl)==0) )
        returnFl = false;
    }

    // update the lexer state
    p->curTokenId      = maxIdx==kInvalidIdx ? kUnknownLexTId : p->mfp[ maxIdx ].typeId;
    p->curTokenCharIdx = p->ci;
    p->curTokenCharCnt = maxCharCnt;

    // advance the text buffer
    p->ci += maxCharCnt;

    if( returnFl )
      return p->curTokenId;
  }

  p->lastRC = kEofRC;

  return kEofLexTId;
}

unsigned cw::lexTokenId( lexH_t h )
{
  lex_t* p = _lexHandleToPtr(h);
  return p->curTokenId;
}

// Return a pointer into the text buffer at the first character of the current
// token, or nullptr if there is no current token.  For quoted strings the
// opening quote is skipped.  The text is NOT zero terminated at the end of
// the token - use lexTokenCharCount() for its length.
const char* cw::lexTokenText( lexH_t h )
{
  lex_t* p = _lexHandleToPtr(h);

  if( p->curTokenCharIdx == kInvalidIdx )
    return nullptr;

  unsigned n = p->curTokenId == kQStrLexTId ? 1 : 0;

  return p->cp + p->curTokenCharIdx + n;
}

// Return the character count of the current token, or 0 if there is no
// current token.  For quoted strings the two quote characters are not counted.
unsigned cw::lexTokenCharCount( lexH_t h )
{
  lex_t* p = _lexHandleToPtr(h);

  if( p->curTokenCharIdx == kInvalidIdx )
    return 0;

  unsigned n = p->curTokenId == kQStrLexTId ? 2 : 0;

  return p->curTokenCharCnt - n;
}

// The lexTokenXXX() numeric conversions return 0 when there is no current token.

int cw::lexTokenInt( lexH_t h )
{
  const char* s = lexTokenText(h);
  return s==nullptr ? 0 : strtol( s,nullptr,0 );
}

unsigned cw::lexTokenUInt( lexH_t h )
{
  // strtoul() rather than strtol(): values above LONG_MAX must not saturate
  const char* s = lexTokenText(h);
  return s==nullptr ? 0 : strtoul( s,nullptr,0 );
}

float cw::lexTokenFloat( lexH_t h )
{
  const char* s = lexTokenText(h);
  return s==nullptr ? 0 : strtof( s,nullptr );
}

double cw::lexTokenDouble( lexH_t h )
{
  const char* s = lexTokenText(h);
  return s==nullptr ? 0 : strtod( s,nullptr );
}

// True if the current token is an integer which had a 'u'/'U' suffix.
bool cw::lexTokenIsUnsigned( lexH_t h )
{
  lex_t* p = _lexHandleToPtr(h);
  return p->curTokenId == kIntLexTId && cwIsFlag(p->attrFlags,kIntUnsignedLexFl);
}

// True if the current token is a real which had an 'f'/'F' suffix.
bool cw::lexTokenIsSinglePrecision( lexH_t h )
{
  lex_t* p = _lexHandleToPtr(h);
  return p->curTokenId == kRealLexTId && cwIsFlag(p->attrFlags,kRealFloatLexFl);
}

// Line number (1 based) of the current token.
unsigned cw::lexCurrentLineNumber( lexH_t h )
{
  lex_t* p = _lexHandleToPtr(h);
  return p->curLine + 1;
}

// Column number (1 based) of the current token.
unsigned cw::lexCurrentColumnNumber( lexH_t h )
{
  lex_t* p = _lexHandleToPtr(h);
  return p->curCol + 1;
}

unsigned cw::lexErrorRC( lexH_t h )
{
  lex_t* p = _lexHandleToPtr(h);
  return p->lastRC;
}

// Return a printable label for a token type id.  For user registered tokens
// the label is the registered token string.
// NOTE(review): the label literals were lost from the reviewed copy (every one
// read ""); the "<...>" strings below are reconstructed.
const char* cw::lexIdToLabel( lexH_t h, unsigned typeId )
{
  lex_t* p = _lexHandleToPtr(h);

  switch( typeId )
  {
    case kErrorLexTId:    return "<error>";
    case kEofLexTId:      return "<EOF>";
    case kSpaceLexTId:    return "<space>";
    case kRealLexTId:     return "<real>";
    case kIntLexTId:      return "<int>";
    case kHexLexTId:      return "<hex>";
    case kIdentLexTId:    return "<ident>";
    case kQStrLexTId:     return "<qstr>";
    case kBlockCmtLexTId: return "<bcmt>";
    case kLineCmtLexTId:  return "<lcmt>";
    default:
      {
        lexMatcher* mp;

        // Matchers installed via lexRegisterMatcher() have no token string;
        // never hand a null pointer back to a caller who will print it.
        if((mp = _lexFindUserToken(p,typeId,nullptr)) == nullptr || mp->tokenStr == nullptr )
          return "<unknown>";

        return mp->tokenStr;
      }
  }

  return "<invalid>";
}

namespace cw
{
  //{ { label:cwLexEx }
  //(
  // lexTest() gives a simple 'lex' example.
  //)

  //(
  void lexTest()
  {
    rc_t     rc  = kOkRC;
    unsigned tid = kInvalidId;
    lexH_t   h   = lexNullHandle;

    char buf[] =
      "123ident0\n 123.456\nident0\n"
      "0xa12+.2\n"
      "// comment \n"
      "/* block \n"
      "comment */"
      "\"quoted string\""
      "ident1"
      "// last line comment";

    // initialize a lexer with a buffer of text
    if((rc = lexCreate(h,buf,strlen(buf), kReturnSpaceLexFl | kReturnCommentsLexFl)) != kOkRC )
    {
      cwLogError(rc,"Lexer initialization failed.");
      return;
    }

    // register some additional recognizers
    lexRegisterToken(h,kUserLexTId+1,"+");
    lexRegisterToken(h,kUserLexTId+2,"-");

    // ask for token id's
    while( (tid = lexGetNextToken(h)) != kEofLexTId )
    {
      // print information about each token
      cwLogInfo("%i %i %s '%.*s' (%i) ",
        lexCurrentLineNumber(h),
        lexCurrentColumnNumber(h),
        lexIdToLabel(h,tid),
        lexTokenCharCount(h),
        lexTokenText(h) ,
        lexTokenCharCount(h));

      // if the token is a number ...
      if( tid==kIntLexTId || tid==kRealLexTId || tid==kHexLexTId )
      {
        // ... then request the numbers value
        int    iv = lexTokenInt(h);
        double dv = lexTokenDouble(h);

        cwLogInfo("%i %f",iv,dv);
      }

      cwLogInfo("\n");

      // handle errors
      if( tid == kErrorLexTId )
      {
        cwLogInfo("Error:%i\n", lexErrorRC(h));
        break;
      }
    }

    // finalize the lexer
    lexDestroy(h);
  }
}

//)
//}