libcm is a C development framework with an emphasis on audio signal processing applications.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

cmLex.c 24KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983
  1. //| Copyright: (C) 2009-2020 Kevin Larke <contact AT larke DOT org>
  2. //| License: GNU GPL version 3.0 or above. See the accompanying LICENSE file.
  3. #include "cmPrefix.h"
  4. #include "cmGlobal.h"
  5. #include "cmRpt.h"
  6. #include "cmLex.h"
  7. #include "cmErr.h"
  8. #include "cmMem.h"
  9. #include "cmMallocDebug.h"
  10. #include "cmFile.h"
  // Numeric suffix attributes of the current token (stored in cmLex.attrFlags).
  enum
  {
    kRealFloatLexFl   = 0x01,  // set by _cmLexRealMatcher() when a real has an 'f' or 'F' suffix
    kIntUnsignedLexFl = 0x02   // set by _cmLexIntMatcher() when an integer has a 'u' or 'U' suffix
  };
  16. typedef struct
  17. {
  18. unsigned code;
  19. const cmChar_t* msg;
  20. } cmLexErrorRecd;
  21. cmLexErrorRecd cmLexErrorArray[] =
  22. {
  23. { kOkLexRC, "No error. The operation completed successfully."},
  24. { kDuplicateTokenLexRC, "The text or id passed as a user token is already in use by another token."},
  25. { kMissingCmtEndLexRC, "The end of a block comment could not be found."},
  26. { kMissingEndQuoteLexRC, "The end of a quoted string could not be found."},
  27. { kNoMatchLexRC, "The lexer encountered a string which could not be classified."},
  28. { kFileOpenErrLexRC, "File open failed on cmLexSetFile()"},
  29. { kFileSeekErrLexRC, "File seek failed on cmLexSetFile()"},
  30. { kFileTellErrLexRC, "File tell failed on cmLexSetFile()"},
  31. { kFileReadErrLexRC, "File read failed on cmLexSetFile()"},
  32. { kFileCloseErrLexRC, "File close failed on cmLexSetFile()"},
  33. { kMemAllocErrLexRC, "An attempted memory allocation failed"},
  34. { kEofRC, "The end of the input text was encountered (this is a normal condition not an error)"},
  35. { kInvalidLexTIdLexRC, "An invalid token id was encountered."},
  36. { kSignErrorLexRC, "A signed integer has a 'u' or 'U' suffix."},
  37. { kInvalidLexRC, "Unknown lexer error code." }
  38. };
  39. struct cmLex_str;
  40. typedef unsigned (*cmLexMatcherFuncPtr_t)( struct cmLex_str* p, const cmChar_t* cp, unsigned cn, const cmChar_t* keyStr );
  41. // token match function record
  42. typedef struct
  43. {
  44. unsigned typeId; // token type this matcher recognizes
  45. cmLexMatcherFuncPtr_t funcPtr; // recognizer function (only used if userPtr==NULL)
  46. cmChar_t* tokenStr; // fixed string data used by the recognizer (only used if userPtr==NULL)
  47. cmLexUserMatcherPtr_t userPtr; // user defined recognizer function (only used if funcPtr==NULL)
  48. bool enableFl; // true if this matcher is enabled
  49. } cmLexMatcher;
  50. typedef struct cmLex_str
  51. {
  52. cmErr_t err;
  53. const cmChar_t* cp; // character buffer
  54. unsigned cn; // count of characters in buffer
  55. unsigned ci; // current buffer index position
  56. unsigned flags; // lexer control flags
  57. unsigned curTokenId; // type id of the current token
  58. unsigned curTokenCharIdx; // index into cp[] of the current token
  59. unsigned curTokenCharCnt; // count of characters in the current token
  60. unsigned curLine; // line number of the current token
  61. unsigned curCol; // column number of the current token
  62. unsigned nextLine;
  63. unsigned nextCol;
  64. cmChar_t* blockBegCmtStr;
  65. cmChar_t* blockEndCmtStr;
  66. cmChar_t* lineCmtStr;
  67. cmLexMatcher* mfp; // base of matcher array
  68. unsigned mfi; // next available matcher array slot
  69. unsigned mfn; // count of elementes in mfp[]
  70. cmChar_t* textBuf; // text buf used by cmLexSetFile()
  71. unsigned attrFlags; // used to store the int and real suffix type flags
  72. } cmLex;
  73. cmLexH cmLexNullH = { NULL };
  74. bool _cmLexIsNewline( cmChar_t c )
  75. { return c == '\n'; }
  76. bool _cmLexIsCommentTypeId( unsigned typeId )
  77. { return typeId == kBlockCmtLexTId || typeId == kLineCmtLexTId; }
  78. cmLex* _cmLexHandleToPtr( cmLexH h )
  79. {
  80. cmLex* p = (cmLex*)h.h;
  81. assert(p != NULL);
  82. return p;
  83. };
  84. cmRC_t _cmLexError( cmLex* p, unsigned rc, const char* fmt, ... )
  85. {
  86. va_list vl;
  87. va_start(vl,fmt);
  88. unsigned bufCharCnt = 512;
  89. char buf[ bufCharCnt+1 ];
  90. snprintf(buf,bufCharCnt,"Error on line:%i ", p->curLine);
  91. unsigned sn = strlen(buf);
  92. vsnprintf(buf+sn,bufCharCnt-sn,fmt,vl);
  93. buf[bufCharCnt]=0;
  94. cmErrMsg(&p->err,rc,"%s",buf);
  95. va_end(vl);
  96. return rc;
  97. }
  98. // Locate 'keyStr' in cp[cn] and return the index into cp[cn] of the character
  99. // following the last char in 'keyStr'. If keyStr is not found return cmInvalidIdx.
  100. unsigned _cmLexScanTo( const cmChar_t* cp, unsigned cn, const cmChar_t* keyStr )
  101. {
  102. unsigned i = 0;
  103. unsigned n = strlen(keyStr);
  104. if( n <= cn )
  105. for(; i<=cn-n; ++i)
  106. if( strncmp(cp + i, keyStr, n ) == 0 )
  107. return i+n;
  108. return cmInvalidIdx;
  109. }
  110. unsigned _cmLexExactStringMatcher( cmLex* p, const cmChar_t* cp, unsigned cn, const cmChar_t* keyStr )
  111. {
  112. unsigned n = strlen(keyStr);
  113. return strncmp(keyStr,cp,n) == 0 ? n : 0;
  114. }
  115. unsigned _cmLexSpaceMatcher( cmLex* p, const cmChar_t* cp, unsigned cn, const cmChar_t* keyStr )
  116. {
  117. unsigned i=0;
  118. for(; i<cn; ++i)
  119. if( !isspace(cp[i]) )
  120. break;
  121. return i;
  122. }
  123. unsigned _cmLexRealMatcher( cmLex* p, const cmChar_t* cp, unsigned cn, const cmChar_t* keyStr )
  124. {
  125. unsigned i = 0;
  126. unsigned n = 0; // decimal point counter
  127. unsigned d = 0; // digit counter
  128. bool fl = false; // expo flag
  129. for(; i<cn && n<=1; ++i)
  130. {
  131. if( i==0 && cp[i]=='-' ) // allow a leading '-'
  132. continue;
  133. if( isdigit(cp[i]) ) // allow digits
  134. {
  135. ++d;
  136. continue;
  137. }
  138. if( cp[i] == '.' && n==0 ) // allow exactly one decimal point
  139. ++n;
  140. else
  141. break;
  142. }
  143. // if there was at least one digit and the next char is an 'e'
  144. if( d>0 && i<cn && (cp[i] == 'e' || cp[i] == 'E') )
  145. {
  146. unsigned e=0;
  147. ++i;
  148. unsigned j = i;
  149. fl = false;
  150. for(; i<cn; ++i)
  151. {
  152. if( i==j && cp[i]=='-' ) // allow the char following the e to be '-'
  153. continue;
  154. if( isdigit(cp[i]) )
  155. {
  156. ++e;
  157. ++d;
  158. continue;
  159. }
  160. // stop at the first non-digit
  161. break;
  162. }
  163. // an exp exists if digits follwed the 'e'
  164. fl = e > 0;
  165. }
  166. // if at least one digit was found
  167. if( d>0 )
  168. {
  169. // Note that this path allows a string w/o a decimal pt to trigger a match.
  170. if(i<cn)
  171. {
  172. // if the real has a suffix
  173. switch(cp[i])
  174. {
  175. case 'F':
  176. case 'f':
  177. p->attrFlags = cmSetFlag(p->attrFlags,kRealFloatLexFl);
  178. ++i;
  179. break;
  180. }
  181. }
  182. // match w/o suffix return
  183. if( d>0 && (fl || n==1 || cmIsFlag(p->attrFlags,kRealFloatLexFl)) )
  184. return i;
  185. }
  186. return 0; // no-match return
  187. }
  188. unsigned _cmLexIntMatcher( cmLex* p, const cmChar_t* cp, unsigned cn, const cmChar_t* keyStr )
  189. {
  190. unsigned i = 0;
  191. bool signFl = false;
  192. unsigned digitCnt = 0;
  193. for(; i<cn; ++i)
  194. {
  195. if( i==0 && cp[i]=='-' )
  196. {
  197. signFl = true;
  198. continue;
  199. }
  200. if( !isdigit(cp[i]) )
  201. break;
  202. ++digitCnt;
  203. }
  204. // BUG BUG BUG
  205. // If an integer is specified using 'e' notiation
  206. // (see _cmLexRealMatcher()) and the number of exponent places
  207. // specified following the 'e' is positive and >= number of
  208. // digits following the decimal point (in effect zeros are
  209. // padded on the right side) then the value is an integer.
  210. //
  211. // The current implementation recognizes all numeric strings
  212. // containing a decimal point as reals.
  213. // if no integer was found
  214. if( digitCnt==0)
  215. return 0;
  216. // check for suffix
  217. if(i<cn )
  218. {
  219. switch(cp[i])
  220. {
  221. case 'u':
  222. case 'U':
  223. if( signFl )
  224. _cmLexError(p,kSignErrorLexRC,"A signed integer has a 'u' or 'U' suffix.");
  225. else
  226. {
  227. p->attrFlags = cmSetFlag(p->attrFlags,kIntUnsignedLexFl);
  228. ++i;
  229. }
  230. break;
  231. default:
  232. break;
  233. }
  234. }
  235. return i;
  236. }
  237. unsigned _cmLexHexMatcher( cmLex* p, const cmChar_t* cp, unsigned cn, const cmChar_t* keyStr )
  238. {
  239. unsigned i = 0;
  240. if( cn < 3 )
  241. return 0;
  242. if( cp[0]=='0' && cp[1]=='x')
  243. for(i=2; i<cn; ++i)
  244. if( !isxdigit(cp[i]) )
  245. break;
  246. return i;
  247. }
  248. unsigned _cmLexIdentMatcher( cmLex* p, const cmChar_t* cp, unsigned cn, const cmChar_t* keyStr )
  249. {
  250. unsigned i = 0;
  251. if( isalpha(cp[0]) || (cp[0]== '_'))
  252. {
  253. i = 1;
  254. for(; i<cn; ++i)
  255. if( !isalnum(cp[i]) && (cp[i] != '_') )
  256. break;
  257. }
  258. return i;
  259. }
  260. unsigned _cmLexQStrMatcher( cmLex* p, const cmChar_t* cp, unsigned cn, const cmChar_t* keyStr )
  261. {
  262. bool escFl = false;
  263. unsigned i = 0;
  264. if( cp[i] != '"' )
  265. return 0;
  266. for(i=1; i<cn; ++i)
  267. {
  268. if( escFl )
  269. {
  270. escFl = false;
  271. continue;
  272. }
  273. if( cp[i] == '\\' )
  274. {
  275. escFl = true;
  276. continue;
  277. }
  278. if( cp[i] == '"' )
  279. return i+1;
  280. }
  281. return _cmLexError(p, kMissingEndQuoteLexRC, "Missing string literal end quote.");
  282. }
  283. unsigned _cmLexQCharMatcher( cmLex* p, const cmChar_t* cp, unsigned cn, const cmChar_t* keyStr )
  284. {
  285. unsigned i = 0;
  286. if( i >= cn || cp[i]!='\'' )
  287. return 0;
  288. i+=2;
  289. if( i >= cn || cp[i]!='\'')
  290. return 0;
  291. return 3;
  292. }
  293. unsigned _cmLexBlockCmtMatcher( cmLex* p, const cmChar_t* cp, unsigned cn, const cmChar_t* keyStr )
  294. {
  295. unsigned n = strlen(p->blockBegCmtStr);
  296. if( strncmp( p->blockBegCmtStr, cp, n ) == 0 )
  297. {
  298. unsigned i;
  299. if((i = _cmLexScanTo(cp + n, cn-n,p->blockEndCmtStr)) == cmInvalidIdx )
  300. {
  301. _cmLexError(p, kMissingCmtEndLexRC, "Missing end of block comment.");
  302. return 0;
  303. }
  304. return n + i;
  305. }
  306. return 0;
  307. }
  308. unsigned _cmLexLineCmtMatcher( cmLex* p, const cmChar_t* cp, unsigned cn, const cmChar_t* keyStr )
  309. {
  310. unsigned n = strlen(p->lineCmtStr);
  311. if( strncmp( p->lineCmtStr, cp, n ) == 0)
  312. {
  313. unsigned i;
  314. const char newlineStr[] = "\n";
  315. if((i = _cmLexScanTo(cp + n, cn-n, newlineStr)) == cmInvalidIdx )
  316. {
  317. // no EOL was found so the comment must be on the last line of the source
  318. return cn;
  319. }
  320. return n + i;
  321. }
  322. return 0;
  323. }
  324. cmRC_t _cmLexInstallMatcher( cmLex* p, unsigned typeId, cmLexMatcherFuncPtr_t funcPtr, const cmChar_t* keyStr, cmLexUserMatcherPtr_t userPtr )
  325. {
  326. assert( funcPtr==NULL || userPtr==NULL );
  327. assert( !(funcPtr==NULL && userPtr==NULL));
  328. // if there is no space in the user token array - then expand it
  329. if( p->mfi == p->mfn )
  330. {
  331. int incr_cnt = 10;
  332. cmLexMatcher* np = cmMemAllocZ( cmLexMatcher, p->mfn + incr_cnt );
  333. memcpy(np,p->mfp,p->mfi*sizeof(cmLexMatcher));
  334. cmMemPtrFree(&p->mfp);
  335. p->mfp = np;
  336. p->mfn += incr_cnt;
  337. }
  338. p->mfp[p->mfi].tokenStr = NULL;
  339. p->mfp[p->mfi].typeId = typeId;
  340. p->mfp[p->mfi].funcPtr = funcPtr;
  341. p->mfp[p->mfi].userPtr = userPtr;
  342. p->mfp[p->mfi].enableFl = true;
  343. if( keyStr != NULL )
  344. {
  345. // allocate space for the token string and store it
  346. p->mfp[p->mfi].tokenStr = cmMemAlloc( cmChar_t, sizeof(cmChar_t) * (strlen(keyStr)+1) );
  347. strcpy(p->mfp[p->mfi].tokenStr, keyStr );
  348. }
  349. p->mfi++;
  350. return kOkLexRC;
  351. }
  352. cmRC_t _cmLexReset( cmLex* p )
  353. {
  354. p->ci = 0;
  355. p->curTokenId = kErrorLexTId;
  356. p->curTokenCharIdx = cmInvalidIdx;
  357. p->curTokenCharCnt = 0;
  358. p->curLine = 0;
  359. p->curCol = 0;
  360. p->nextLine = 0;
  361. p->nextCol = 0;
  362. cmErrClearRC(&p->err);
  363. return kOkLexRC;
  364. }
  365. cmRC_t _cmLexSetTextBuffer( cmLex* p, const cmChar_t* cp, unsigned cn )
  366. {
  367. p->cp = cp;
  368. p->cn = cn;
  369. return _cmLexReset(p);
  370. }
  371. cmLexH cmLexInit( const cmChar_t* cp, unsigned cn, unsigned flags, cmRpt_t* rpt )
  372. {
  373. cmLexH h;
  374. cmChar_t dfltLineCmt[] = "//";
  375. cmChar_t dfltBlockBegCmt[] = "/*";
  376. cmChar_t dfltBlockEndCmt[] = "*/";
  377. cmLex* p = cmMemAllocZ( cmLex, 1 );
  378. cmErrSetup(&p->err,rpt,"Lexer");
  379. p->flags = flags;
  380. _cmLexSetTextBuffer( p, cp, cn );
  381. int init_mfn = 10;
  382. p->mfp = cmMemAllocZ( cmLexMatcher, init_mfn );
  383. p->mfn = init_mfn;
  384. p->mfi = 0;
  385. p->lineCmtStr = cmMemAlloc( cmChar_t, strlen(dfltLineCmt)+1 );
  386. strcpy( p->lineCmtStr, dfltLineCmt );
  387. p->blockBegCmtStr = cmMemAlloc( cmChar_t, strlen(dfltBlockBegCmt)+1 );
  388. strcpy( p->blockBegCmtStr, dfltBlockBegCmt );
  389. p->blockEndCmtStr = cmMemAlloc( cmChar_t, strlen(dfltBlockEndCmt)+1 );
  390. strcpy( p->blockEndCmtStr, dfltBlockEndCmt );
  391. _cmLexInstallMatcher( p, kSpaceLexTId, _cmLexSpaceMatcher, NULL, NULL );
  392. _cmLexInstallMatcher( p, kRealLexTId, _cmLexRealMatcher, NULL, NULL );
  393. _cmLexInstallMatcher( p, kIntLexTId, _cmLexIntMatcher, NULL, NULL );
  394. _cmLexInstallMatcher( p, kHexLexTId, _cmLexHexMatcher, NULL, NULL );
  395. _cmLexInstallMatcher( p, kIdentLexTId, _cmLexIdentMatcher, NULL, NULL );
  396. _cmLexInstallMatcher( p, kQStrLexTId, _cmLexQStrMatcher, NULL, NULL );
  397. _cmLexInstallMatcher( p, kBlockCmtLexTId, _cmLexBlockCmtMatcher, NULL, NULL );
  398. _cmLexInstallMatcher( p, kLineCmtLexTId, _cmLexLineCmtMatcher, NULL, NULL );
  399. if( cmIsFlag(flags,kReturnQCharLexFl) )
  400. _cmLexInstallMatcher( p, kQCharLexTId, _cmLexQCharMatcher, NULL, NULL );
  401. h.h = p;
  402. _cmLexReset(p);
  403. return h;
  404. }
  405. cmRC_t cmLexFinal( cmLexH* hp )
  406. {
  407. if( hp == NULL || cmLexIsValid(*hp)==false )
  408. return cmOkRC;
  409. cmLex* p = _cmLexHandleToPtr(*hp);
  410. if( p != NULL )
  411. {
  412. if( p->mfp != NULL )
  413. {
  414. unsigned i = 0;
  415. // free the user token strings
  416. for(; i<p->mfi; ++i)
  417. if( p->mfp[i].tokenStr != NULL )
  418. cmMemPtrFree(&p->mfp[i].tokenStr);
  419. // free the matcher array
  420. cmMemPtrFree(&p->mfp);
  421. p->mfi = 0;
  422. p->mfn = 0;
  423. }
  424. cmMemPtrFree(&p->lineCmtStr);
  425. cmMemPtrFree(&p->blockBegCmtStr);
  426. cmMemPtrFree(&p->blockEndCmtStr);
  427. cmMemPtrFree(&p->textBuf);
  428. // free the lexer object
  429. cmMemPtrFree(&p);
  430. hp->h = NULL;
  431. }
  432. return kOkLexRC;
  433. }
  434. cmRC_t cmLexReset( cmLexH h )
  435. {
  436. cmLex* p = _cmLexHandleToPtr(h);
  437. return _cmLexReset(p);
  438. }
  439. bool cmLexIsValid( cmLexH h )
  440. { return h.h != NULL; }
  441. cmRC_t cmLexSetTextBuffer( cmLexH h, const cmChar_t* cp, unsigned cn )
  442. {
  443. cmLex* p = _cmLexHandleToPtr(h);
  444. return _cmLexSetTextBuffer(p,cp,cn);
  445. }
  446. cmRC_t cmLexSetFile( cmLexH h, const cmChar_t* fn )
  447. {
  448. cmRC_t rc = kOkLexRC;
  449. cmFileH_t fh = cmFileNullHandle;
  450. cmLex* p = _cmLexHandleToPtr(h);
  451. long n = 0;
  452. assert( fn != NULL && p != NULL );
  453. // open the file
  454. if( cmFileOpen(&fh,fn,kReadFileFl,p->err.rpt) != kOkFileRC )
  455. return kFileOpenErrLexRC;
  456. // seek to the end of the file
  457. if( cmFileSeek(fh,kEndFileFl,0) != kOkFileRC )
  458. return kFileSeekErrLexRC;
  459. // get the length of the file
  460. if( cmFileTell(fh,&n) != kOkFileRC )
  461. return kFileTellErrLexRC;
  462. // rewind to the beginning of the file
  463. if( cmFileSeek(fh,kBeginFileFl,0) != kOkFileRC )
  464. return kFileSeekErrLexRC;
  465. // allocate the text buffer
  466. if((p->textBuf = cmMemResizeZ( char, p->textBuf, n+1)) == NULL )
  467. {
  468. rc = _cmLexError(p,kMemAllocErrLexRC,"Unable to allocate the text file buffer for:'%s'.",fn);
  469. goto errLabel;
  470. }
  471. // read the file into the buffer
  472. if( cmFileRead(fh,p->textBuf,n) != kOkFileRC )
  473. return kFileReadErrLexRC;
  474. if((rc = _cmLexSetTextBuffer( p, p->textBuf, n )) != kOkLexRC )
  475. goto errLabel;
  476. errLabel:
  477. // close the file
  478. if( cmFileClose(&fh) != kOkFileRC )
  479. return kFileCloseErrLexRC;
  480. return rc;
  481. }
  482. cmLexMatcher* _cmLexFindUserToken( cmLex* p, unsigned id, const cmChar_t* tokenStr )
  483. {
  484. unsigned i = 0;
  485. for(; i<p->mfi; ++i)
  486. {
  487. if( id != cmInvalidId && p->mfp[i].typeId == id )
  488. return p->mfp + i;
  489. if( p->mfp[i].tokenStr != NULL && tokenStr != NULL && strcmp(p->mfp[i].tokenStr,tokenStr)==0 )
  490. return p->mfp + i;
  491. }
  492. return NULL;
  493. }
  494. cmRC_t cmLexRegisterToken( cmLexH h, unsigned id, const cmChar_t* tokenStr )
  495. {
  496. cmLex* p = _cmLexHandleToPtr(h);
  497. // prevent duplicate tokens
  498. if( _cmLexFindUserToken( p, id, tokenStr ) != NULL )
  499. return _cmLexError( p, kDuplicateTokenLexRC, "id:%i token:%s duplicates the token string or id", id, tokenStr );
  500. return _cmLexInstallMatcher( p, id, _cmLexExactStringMatcher, tokenStr, NULL );
  501. }
  502. cmRC_t cmLexRegisterMatcher( cmLexH h, unsigned id, cmLexUserMatcherPtr_t userPtr )
  503. {
  504. cmLex* p = _cmLexHandleToPtr(h);
  505. // prevent duplicate tokens
  506. if( _cmLexFindUserToken( p, id, NULL ) != NULL )
  507. return _cmLexError( p, kDuplicateTokenLexRC, "A token matching function has already been installed for token id: %i", id );
  508. return _cmLexInstallMatcher( p, id, NULL, NULL, userPtr );
  509. }
  510. cmRC_t cmLexEnableToken( cmLexH h, unsigned id, bool enableFl )
  511. {
  512. cmLex* p = _cmLexHandleToPtr(h);
  513. unsigned mi = 0;
  514. for(; mi<p->mfi; ++mi)
  515. if( p->mfp[mi].typeId == id )
  516. {
  517. p->mfp[mi].enableFl = enableFl;
  518. return cmOkRC;
  519. }
  520. return _cmLexError( p, kInvalidLexTIdLexRC, "%i is not a valid token type id.",id);
  521. }
  522. unsigned cmLexFilterFlags( cmLexH h )
  523. {
  524. cmLex* p = _cmLexHandleToPtr(h);
  525. return p->flags;
  526. }
  527. void cmLexSetFilterFlags( cmLexH h, unsigned flags )
  528. {
  529. cmLex* p = _cmLexHandleToPtr(h);
  530. p->flags = flags;
  531. }
  532. unsigned cmLexGetNextToken( cmLexH h )
  533. {
  534. cmLex* p = _cmLexHandleToPtr(h);
  535. if( cmErrLastRC(&p->err) != kOkLexRC )
  536. return kErrorLexTId;
  537. while( p->ci < p->cn )
  538. {
  539. unsigned i;
  540. unsigned mi = 0;
  541. unsigned maxCharCnt = 0;
  542. unsigned maxIdx = cmInvalidIdx;
  543. p->curTokenId = kErrorLexTId;
  544. p->curTokenCharIdx = cmInvalidIdx;
  545. p->curTokenCharCnt = 0;
  546. p->attrFlags = 0;
  547. // try each matcher
  548. for(; mi<p->mfi; ++mi)
  549. if( p->mfp[mi].enableFl )
  550. {
  551. unsigned charCnt = 0;
  552. if( p->mfp[mi].funcPtr != NULL )
  553. charCnt = p->mfp[mi].funcPtr(p, p->cp + p->ci, p->cn - p->ci, p->mfp[mi].tokenStr );
  554. else
  555. charCnt = p->mfp[mi].userPtr( p->cp + p->ci, p->cn - p->ci);
  556. // notice if the matcher set the error code
  557. if( cmErrLastRC(&p->err) != kOkLexRC )
  558. return kErrorLexTId;
  559. // if this matched token is longer then the prev. matched token or
  560. // if the prev matched token was an identifier and this matched token is an equal length user defined token
  561. if( (charCnt > maxCharCnt)
  562. || (charCnt>0 && charCnt==maxCharCnt && p->mfp[maxIdx].typeId==kIdentLexTId && p->mfp[mi].typeId >=kUserLexTId )
  563. || (charCnt>0 && charCnt<maxCharCnt && p->mfp[maxIdx].typeId==kIdentLexTId && p->mfp[mi].typeId >=kUserLexTId && cmIsFlag(p->flags,kUserDefPriorityLexFl))
  564. )
  565. {
  566. maxCharCnt = charCnt;
  567. maxIdx = mi;
  568. }
  569. }
  570. // no token was matched
  571. if( maxIdx == cmInvalidIdx )
  572. {
  573. if( cmIsFlag(p->flags,kReturnUnknownLexFl) )
  574. {
  575. maxCharCnt = 1;
  576. }
  577. else
  578. {
  579. _cmLexError( p, kNoMatchLexRC, "Unable to recognize token:'%c'.",*(p->cp+p->ci));
  580. return kErrorLexTId;
  581. }
  582. }
  583. // update the current line and column position
  584. p->curLine = p->nextLine;
  585. p->curCol = p->nextCol;
  586. // find the next column and line position
  587. for(i=0; i<maxCharCnt; ++i)
  588. {
  589. if( _cmLexIsNewline(p->cp[ p->ci + i ]) )
  590. {
  591. p->nextLine++;
  592. p->nextCol = 1;
  593. }
  594. else
  595. p->nextCol++;
  596. }
  597. bool returnFl = true;
  598. if( maxIdx != cmInvalidIdx )
  599. {
  600. // check the space token filter
  601. if( (p->mfp[ maxIdx ].typeId == kSpaceLexTId) && (cmIsFlag(p->flags,kReturnSpaceLexFl)==0) )
  602. returnFl = false;
  603. // check the comment token filter
  604. if( _cmLexIsCommentTypeId(p->mfp[ maxIdx ].typeId) && (cmIsFlag(p->flags,kReturnCommentsLexFl)==0) )
  605. returnFl = false;
  606. }
  607. // update the lexer state
  608. p->curTokenId = maxIdx==cmInvalidIdx ? kUnknownLexTId : p->mfp[ maxIdx ].typeId;
  609. p->curTokenCharIdx = p->ci;
  610. p->curTokenCharCnt = maxCharCnt;
  611. // advance the text buffer
  612. p->ci += maxCharCnt;
  613. if( returnFl )
  614. return p->curTokenId;
  615. }
  616. cmErrSetRC(&p->err,kEofRC);
  617. return kEofLexTId;
  618. }
  619. unsigned cmLexTokenId( cmLexH h )
  620. {
  621. cmLex* p = _cmLexHandleToPtr(h);
  622. return p->curTokenId;
  623. }
  624. const cmChar_t* cmLexTokenText( cmLexH h )
  625. {
  626. cmLex* p = _cmLexHandleToPtr(h);
  627. if( p->curTokenCharIdx == cmInvalidIdx )
  628. return NULL;
  629. unsigned n = p->curTokenId == kQStrLexTId ? 1 : 0;
  630. return p->cp + p->curTokenCharIdx + n;
  631. }
  632. unsigned cmLexTokenCharCount( cmLexH h )
  633. {
  634. cmLex* p = _cmLexHandleToPtr(h);
  635. if( p->curTokenCharIdx == cmInvalidIdx )
  636. return 0;
  637. unsigned n = p->curTokenId == kQStrLexTId ? 2 : 0;
  638. return p->curTokenCharCnt - n;
  639. }
  640. int cmLexTokenInt( cmLexH h )
  641. { return strtol( cmLexTokenText(h),NULL,0 ); }
  642. unsigned cmLexTokenUInt( cmLexH h )
  643. { return strtol( cmLexTokenText(h),NULL,0 ); }
  644. float cmLexTokenFloat( cmLexH h )
  645. { return strtof( cmLexTokenText(h),NULL ); }
  646. double cmLexTokenDouble( cmLexH h )
  647. { return strtod( cmLexTokenText(h),NULL ); }
  648. bool cmLexTokenIsUnsigned( cmLexH h )
  649. {
  650. cmLex* p = _cmLexHandleToPtr(h);
  651. return p->curTokenId == kIntLexTId && cmIsFlag(p->attrFlags,kIntUnsignedLexFl);
  652. }
  653. bool cmLexTokenIsSinglePrecision( cmLexH h )
  654. {
  655. cmLex* p = _cmLexHandleToPtr(h);
  656. return p->curTokenId == kRealLexTId && cmIsFlag(p->attrFlags,kRealFloatLexFl);
  657. }
  658. unsigned cmLexCurrentLineNumber( cmLexH h )
  659. {
  660. cmLex* p = _cmLexHandleToPtr(h);
  661. return p->curLine + 1;
  662. }
  663. unsigned cmLexCurrentColumnNumber( cmLexH h )
  664. {
  665. cmLex* p = _cmLexHandleToPtr(h);
  666. return p->curCol + 1;
  667. }
  668. unsigned cmLexErrorRC( cmLexH h )
  669. {
  670. cmLex* p = _cmLexHandleToPtr(h);
  671. return cmErrLastRC(&p->err);
  672. }
  673. const cmChar_t* cmLexIdToLabel( cmLexH h, unsigned typeId )
  674. {
  675. cmLex* p = _cmLexHandleToPtr(h);
  676. switch( typeId )
  677. {
  678. case kErrorLexTId: return "<error>";
  679. case kEofLexTId: return "<EOF>";
  680. case kSpaceLexTId: return "<space>";
  681. case kRealLexTId: return "<real>";
  682. case kIntLexTId: return "<int>";
  683. case kHexLexTId: return "<hex>";
  684. case kIdentLexTId: return "<ident>";
  685. case kQStrLexTId: return "<qstr>";
  686. case kBlockCmtLexTId: return "<bcmt>";
  687. case kLineCmtLexTId: return "<lcmt>";
  688. default:
  689. {
  690. cmLexMatcher* mp;
  691. if((mp = _cmLexFindUserToken(p,typeId,NULL)) == NULL )
  692. return "<unknown>";
  693. return mp->tokenStr;
  694. }
  695. }
  696. return "<invalid>";
  697. }
  698. const cmChar_t* cmLexRcToMsg( unsigned rc )
  699. {
  700. unsigned i=0;
  701. for(i=0; cmLexErrorArray[i].code != kInvalidLexRC; ++i)
  702. if( cmLexErrorArray[i].code == rc )
  703. break;
  704. return cmLexErrorArray[i].msg;
  705. }
  706. //( { label:cmLexEx }
  707. //
  708. // cmLexTest() gives a simple cmLex example.
  709. //
  710. void cmLexTest( cmRpt_t* rpt)
  711. {
  712. cmChar_t buf[] =
  713. "123ident0\n 123.456\nident0\n"
  714. "0xa12+.2\n"
  715. "// comment \n"
  716. "/* block \n"
  717. "comment */"
  718. "\"quoted string\""
  719. "ident1"
  720. "// last line comment";
  721. // initialize a lexer with a buffer of text
  722. cmLexH h = cmLexInit(buf,strlen(buf),
  723. kReturnSpaceLexFl | kReturnCommentsLexFl,rpt);
  724. // verify that the lexer initialization succeded.
  725. if( cmLexIsValid(h) == false )
  726. {
  727. cmRptPrintf(rpt,"Lexer initialization failed.");
  728. return;
  729. }
  730. // register some additional recoginizers
  731. cmLexRegisterToken(h,kUserLexTId+1,"+");
  732. cmLexRegisterToken(h,kUserLexTId+2,"-");
  733. unsigned tid;
  734. // ask for token id's
  735. while( (tid = cmLexGetNextToken(h)) != kEofLexTId )
  736. {
  737. // print information about each token
  738. cmRptPrintf(rpt,"%i %i %s '%.*s' (%i) ",
  739. cmLexCurrentLineNumber(h),
  740. cmLexCurrentColumnNumber(h),
  741. cmLexIdToLabel(h,tid),
  742. cmLexTokenCharCount(h),
  743. cmLexTokenText(h) ,
  744. cmLexTokenCharCount(h));
  745. // if the token is a number ...
  746. if( tid==kIntLexTId || tid==kRealLexTId || tid==kHexLexTId )
  747. {
  748. // ... then request the numbers value
  749. int iv = cmLexTokenInt(h);
  750. double dv = cmLexTokenDouble(h);
  751. cmRptPrintf(rpt,"%i %f",iv,dv);
  752. }
  753. cmRptPrintf(rpt,"\n");
  754. // handle errors
  755. if( tid == kErrorLexTId )
  756. {
  757. cmRptPrintf(rpt,"Error:%i\n", cmLexErrorRC(h));
  758. break;
  759. }
  760. }
  761. // finalize the lexer
  762. cmLexFinal(&h);
  763. }
  764. //)