Added kReturnUnknownLexFl configuration flag. If this flag is set, tokens which are not recognized will be returned with the token id set to kUnknownLexTId. This is useful for returning all text. Added kUserDefPriorityLexFl configuration flag. If this flag is set, user defined tokens take priority even if an identifier (kIdentLexTId) has a longer match. Added cmLexEnableToken(). This function allows token recognizers to be enabled and disabled.

пре 11 година · 445824a5ed
--- a/cmLex.c
+++ b/cmLex.c
 
															
															   { kFileCloseErrLexRC,    "File close failed on cmLexSetFile()"},
														
 
															
															   { kMemAllocErrLexRC,     "An attempted memory allocation failed"},
														
 
															
															   { kEofRC,                "The end of the input text was encountered (this is a normal condition not an error)"},
														
 
															
															+  { kInvalidLexTIdLexRC,   "An invalid token id was encountered."},
														
 
															
															   { kInvalidLexRC,         "Unknown lexer error code." }
														
 
															
															 };
														
 
															
															   cmLexMatcherFuncPtr_t funcPtr;  // recognizer function (only used if userPtr==NULL)
														
 
															
															   cmChar_t*             tokenStr; // fixed string data used by the recognizer (only used if userPtr==NULL)
														
 
															
															   cmLexUserMatcherPtr_t userPtr;  // user defined recognizer function (only used if funcPtr==NULL)
														
 
															
															+  bool                  enableFl; // true if this matcher is enabled
														
 
															
															 } cmLexMatcher;
														
 
															
															   p->mfp[p->mfi].typeId   = typeId;
														
 
															
															   p->mfp[p->mfi].funcPtr  = funcPtr;
														
 
															
															   p->mfp[p->mfi].userPtr  = userPtr;
														
 
															
															+  p->mfp[p->mfi].enableFl = true;
														
 
															
															   if( keyStr != NULL )
														
 
															
															   {
														
 
															
															   return _cmLexInstallMatcher( p, id, NULL, NULL, userPtr );
														
 
															
															 }
														
 
															
															+cmRC_t             cmLexEnableToken( cmLexH h, unsigned id, bool enableFl )
														
 
															
															+{
														
 
															
															+  cmLex* p = _cmLexHandleToPtr(h);
														
 
															
															+
														
 
															
															+  unsigned mi = 0;
														
 
															
															+  for(; mi<p->mfi; ++mi)
														
 
															
															+    if( p->mfp[mi].typeId == id )
														
 
															
															+    {
														
 
															
															+      p->mfp[mi].enableFl = enableFl;
														
 
															
															+      return cmOkRC;
														
 
															
															+    }
														
 
															
															+
														
 
															
															+  return _cmLexError( p, kInvalidLexTIdLexRC, "%i is not a valid token type id.",id);
														
 
															
															+}
														
 
															
															+
														
 
															
															 unsigned           cmLexFilterFlags( cmLexH h )
														
 
															
															 {
														
 
															
															   cmLex* p = _cmLexHandleToPtr(h);
														
 
															
															     p->curTokenCharCnt = 0;
														
 
															
															+    // try each matcher
														
 
															
															     for(; mi<p->mfi; ++mi)
														
 
															
															-    {
														
 
															
															-      unsigned charCnt = 0;
														
 
															
															-      if( p->mfp[mi].funcPtr != NULL )
														
 
															
															-        charCnt = p->mfp[mi].funcPtr(p, p->cp + p->ci, p->cn - p->ci, p->mfp[mi].tokenStr );
														
 
															
															-      else
														
 
															
															-        charCnt = p->mfp[mi].userPtr( p->cp + p->ci, p->cn - p->ci);
														
 
															
															-
														
 
															
															-      if( cmErrLastRC(&p->err) != kOkLexRC )
														
 
															
															-        return kErrorLexTId;
														
 
															
															-
														
 
															
															-      // if this matched token is longer then the prev. matched token or
														
 
															
															-      // if the prev matched token was an identifier and this matched token is an equal length user defined token
														
 
															
															-      if( (charCnt > maxCharCnt) || (charCnt>0 && charCnt==maxCharCnt && p->mfp[maxIdx].typeId==kIdentLexTId && p->mfp[mi].typeId >=kUserLexTId ) )
														
 
															
															+      if( p->mfp[mi].enableFl )
														
 
															
															       {
														
 
															
															-        maxCharCnt = charCnt;
														
 
															
															-        maxIdx     = mi;
														
 
															
															-      }
														
 
															
															+        unsigned charCnt = 0;
														
 
															
															+        if( p->mfp[mi].funcPtr != NULL )
														
 
															
															+          charCnt = p->mfp[mi].funcPtr(p, p->cp + p->ci, p->cn - p->ci, p->mfp[mi].tokenStr );
														
 
															
															+        else
														
 
															
															+          charCnt = p->mfp[mi].userPtr( p->cp + p->ci, p->cn - p->ci);
														
 
															
															+
														
 
															
															+        if( cmErrLastRC(&p->err) != kOkLexRC )
														
 
															
															+          return kErrorLexTId;
														
 
															
															+
														
 
															
															+        // if this matched token is longer then the prev. matched token or
														
 
															
															+        // if the prev matched token was an identifier and this matched token is an equal length user defined token
														
 
															
															+        if( (charCnt > maxCharCnt) 
														
 
															
															+          || (charCnt>0 && charCnt==maxCharCnt && p->mfp[maxIdx].typeId==kIdentLexTId && p->mfp[mi].typeId >=kUserLexTId ) 
														
 
															
															+          || (charCnt>0 && charCnt<maxCharCnt  && p->mfp[maxIdx].typeId==kIdentLexTId && p->mfp[mi].typeId >=kUserLexTId && cmIsFlag(p->flags,kUserDefPriorityLexFl))
														
 
															
															+            )
														
 
															
															+        {
														
 
															
															+          maxCharCnt = charCnt;
														
 
															
															+          maxIdx     = mi;
														
 
															
															+        }
														
 
															
															-    }
														
 
															
															+      }
														
 
															
															     // no token was matched
														
 
															
															     if( maxIdx == cmInvalidIdx )
														
 
															
															     {
														
 
															
															-      _cmLexError( p, kNoMatchLexRC, "Unable to recognize token:'%c'.",*(p->cp+p->ci));
														
 
															
															-      return kErrorLexTId;     
														
 
															
															+      if( cmIsFlag(p->flags,kReturnUnknownLexFl) )
														
 
															
															+      {
														
 
															
															+        maxCharCnt = 1;
														
 
															
															+      }
														
 
															
															+      else
														
 
															
															+      {
														
 
															
															+        _cmLexError( p, kNoMatchLexRC, "Unable to recognize token:'%c'.",*(p->cp+p->ci));
														
 
															
															+        return kErrorLexTId;     
														
 
															
															+      }
														
 
															
															     }
														
 
															
															     // update the current line and column position    
														
 
															
															     bool returnFl = true;
														
 
															
															-    // check the space token filter
														
 
															
															-    if( (p->mfp[ maxIdx ].typeId == kSpaceLexTId) && (cmIsFlag(p->flags,kReturnSpaceLexFl)==0) )
														
 
															
															-      returnFl = false;
														
 
															
															+    if( maxIdx != cmInvalidIdx )
														
 
															
															+    {
														
 
															
															+      // check the space token filter
														
 
															
															+      if( (p->mfp[ maxIdx ].typeId == kSpaceLexTId) && (cmIsFlag(p->flags,kReturnSpaceLexFl)==0) )
														
 
															
															+        returnFl = false;
														
 
															
															-    // check the comment token filter
														
 
															
															-    if( _cmLexIsCommentTypeId(p->mfp[ maxIdx ].typeId) && (cmIsFlag(p->flags,kReturnCommentsLexFl)==0) )
														
 
															
															-      returnFl = false;
														
 
															
															+      // check the comment token filter
														
 
															
															+      if( _cmLexIsCommentTypeId(p->mfp[ maxIdx ].typeId) && (cmIsFlag(p->flags,kReturnCommentsLexFl)==0) )
														
 
															
															+        returnFl = false;
														
 
															
															+    }
														
 
															
															     // update the lexer state
														
 
															
															-    p->curTokenId      = p->mfp[ maxIdx ].typeId;    
														
 
															
															+    p->curTokenId      = maxIdx==cmInvalidIdx ? kUnknownLexTId : p->mfp[ maxIdx ].typeId;    
														
 
															
															     p->curTokenCharIdx = p->ci;
														
 
															
															     p->curTokenCharCnt = maxCharCnt;
														
--- a/cmLex.h
+++ b/cmLex.h
 
															
															 enum
														
 
															
															 {
														
 
															
															   kErrorLexTId,    // 0  the lexer was unable to identify the current token
														
 
															
															-  kEofLexTId,      // 1  the lexer reached the end of input
														
 
															
															-  kSpaceLexTId,    // 2  white space
														
 
															
															-  kRealLexTId,     // 3  real number (contains a decimal point or is in scientific notation) 
														
 
															
															-  kIntLexTId,      // 4  decimal integer
														
 
															
															-  kHexLexTId,      // 5  hexidecimal integer
														
 
															
															-  kIdentLexTId,    // 6  identifier
														
 
															
															-  kQStrLexTId,     // 7  quoted string
														
 
															
															-  kBlockCmtLexTId, // 8  block comment
														
 
															
															-  kLineCmtLexTId,  // 9  line comment
														
 
															
															-  kUserLexTId      // 10 user registered token (See cmLexRegisterToken().)
														
 
															
															+  kUnknownLexTId,  // 1  the token is of an unknown type (only used when kReturnUnknownLexFl is set)
														
 
															
															+  kEofLexTId,      // 2  the lexer reached the end of input
														
 
															
															+  kSpaceLexTId,    // 3  white space
														
 
															
															+  kRealLexTId,     // 4  real number (contains a decimal point or is in scientific notation) 
														
 
															
															+  kIntLexTId,      // 5  decimal integer
														
 
															
															+  kHexLexTId,      // 6  hexidecimal integer
														
 
															
															+  kIdentLexTId,    // 7  identifier
														
 
															
															+  kQStrLexTId,     // 8  quoted string
														
 
															
															+  kBlockCmtLexTId, // 9  block comment
														
 
															
															+  kLineCmtLexTId,  // 10  line comment
														
 
															
															+  kUserLexTId      // 11 user registered token (See cmLexRegisterToken().)
														
 
															
															 };
														
 
															
															 // Lexer control flags used with cmLexInit().
														
 
															
															 enum
														
 
															
															 {
														
 
															
															   kReturnSpaceLexFl    = 0x01, //< Return space tokens
														
 
															
															-  kReturnCommentsLexFl = 0x02  //< Return comment tokens
														
 
															
															+  kReturnCommentsLexFl = 0x02, //< Return comment tokens
														
 
															
															+  kReturnUnknownLexFl  = 0x04, //< Return unknown tokens
														
 
															
															+  kUserDefPriorityLexFl= 0x08  //< User defined tokens take priority even if a kIdentLexTId token has a longer match
														
 
															
															 };
														
 
															
															 // cmLex result codes.
														
 
															
															   kFileCloseErrLexRC,      //< 9  File close failed on cmLexSetFile()
														
 
															
															   kMemAllocErrLexRC,       //< 10  An attempted memory allocation failed
														
 
															
															   kEofRC,                  //< 11 The end of the input text was encountered (this is a normal condition not an error)
														
 
															
															-  kInvalidLexRC            //< 12 Sentinal value.
														
 
															
															+  kInvalidLexTIdLexRC,     //< 12 An invalid lex token id was encountered.
														
 
															
															+  kInvalidLexRC            //< 13 Sentinal value.
														
 
															
															 };
														
 
															
															 cmRC_t             cmLexRegisterMatcher( cmLexH h, unsigned id, cmLexUserMatcherPtr_t funcPtr );
														
 
															
															+// Enable or disable the specified token type.
														
 
															
															+cmRC_t             cmLexEnableToken( cmLexH h, unsigned id, bool enableFl );
														
 
															
															+
														
 
															
															 // Get and set the lexer filter flags kReturnXXXLexFl.
														
 
															
															 // These flags can be safely enabled and disabled between
														
 
															
															 // calls to cmLexGetNextToken().