cmXml.h/c : Updated but still incomplete.

2015-12-10 17:43:17 -05:00 · 2015-12-10 17:43:17 -05:00 · a839c3dd92
commit a839c3dd92
parent a2a613f3b8
2 changed files with 553 additions and 61 deletions
--- a/cmXml.c
+++ b/cmXml.c
@ -22,7 +22,7 @@ node     -> beg-node node-body end-node
 node-body -> data-text
          |  node
  
-beg-node   -> "<" tag-label  attr-list ">"
+beg-node   -> "<" tag-label  attr-list {"/"} ">"
 end-node   -> "<" tag-label "/>"
 attr-list  -> attr*
 attr       -> attr-label "=" qstring
@ -41,69 +41,96 @@ dt-text    -> A string of characters beginning with a non-whitespace

 cmmt-text  -> A string of characters ending with '-->'

-*/  
+*/
+
+/*
+
+
+t = get_next_attr_token(p,end_char, tn* )
+{
+
+}
+
+parse_attr_list(p,end_char)
+{
+}
+
+read_beg_tag(p)
+{
+   c = goto_next_non_white_char(p)
+
+   if( c != '<' )
+     error();
+
+   c = goto_next_non_white_char(p)
+
+   if c == '?'
+   {
+      end_tag_str = "?";
+      if( scan_past(p,"xml") == false )
+        error();
+
+      parse_attr_list(p,'?');
+   }
+
+   if c == '!'
+   {
+      if( scan_past(p,"--") )
+      {
+         if(go_past(p,"-->")==false)
+           error();
+      }
+
+      if( scan_past(p,"DOCTYPE") )
+      {
+          while( s = get_next_attr_token(p,'>') != NULL )
+             store_attr(p,s,"");
+      }
+   }
+    
+}
+
+read_body( p )
+{
+  c = goto_next_non_white_char(p);
+
+  if c == '<'
+    read_node(p)
+  else
+    read_data_string(p)
+}
+
+n = read_node( p )
+{
+   t = read_beg_tag(p);
+
+   if( is_beg_tag(t) )
+   {
+      read_body()
+      read_end_tag()
+   }
+}
+
+ */
 
 cmXmlH_t cmXmlNullHandle = cmSTATIC_NULL_HANDLE;

 typedef struct
 {
-  cmErr_t      err;   // 
-  cmLHeapH_t   heapH; // linked heap stores all node memory
-  cmLexH       lexH; 
-  cmXmlNode_t* root;
+  cmErr_t      err;     // error state used to report parse errors
+  cmLHeapH_t   heapH;   // linked heap stores all node memory
+  
+  cmChar_t*    b;       // base of the text buffer
+  unsigned     bn;      // length of the text buffer in characters
+  cmChar_t*    c;       // current lexer position
+  int          line;    // current line number (printed with "%i" by _cmXmlSyntaxError())
+  
+  cmXmlNode_t* root;    // root XML tree node
+  cmXmlNode_t* decl;    // xml declaration node <? ... ?>
+  cmXmlNode_t* doctype; // DOCTYPE  node
+
+  cmXmlNode_t* stack;   // top of the parsing stack (popped by _cmXmlReadEndTag())
 } cmXml_t;

-enum
-{
-  kTagBegLexTId = kUserLexTId+1,
-  kTagEndLexTId,
-  kDeclBegLexTId,
-  kDeclEndLexTId,
-  kSpclBegLexTId,
-  kDocTypeLexTId,
-  kCmmtBegLexTId,
-  kCmmtEndLexTId,
-  kEqualLexTId
-};
-
-cmXmlToken_t _cmXmlTokenArray[] = 
-{
-  { kTagBegLexTId = kUserLexId+1,  "<" },
-  { kTagEndLexTid,  ">" },
-  { kDeclBegLexTId, "<?" },
-  { kDeclEndLexTid, "?>" },
-  { kSpclBegLexTId, "<!" },
-  { kDocTypeLexTId, "<!DOCTYPE" },
-  { kCmmtBegLexTId, "<!--" },
-  { kCmmtEndLexTid, "-->" },
-  { kEqualLexTid,   "=" },
-  { kErrorLexTId,""}  
-};
-
-// Match a tag label.  
-// A string ending with a <space> or '>'
-unsigned cmLexTagLabelMatcher( const cmChar_t* cp, unsigned cn )
-{
-  for(i=0; i<cn; ++i)
-    if( cp[i] == '>' || isspace(cp[i]) )
-      break;
-  return i>0 ? i-1 : 0;  
-}
-
-unsigned cmLexStringMatcher( const cmChar_t* cp, unsigned cn )
-{
-  for(i=0; i<cn; ++i)
-  {
-    if( cp[i] == ' ')
-      break;
-    
-    if( cp[i] == '<' )
-      break;
-
-  }
-  return i>0 ?   
-}
-

 cmXml_t* _cmXmlHandleToPtr( cmXmlH_t h )
 {
@ -228,3 +255,451 @@ cmXmlRC_t cmXmlParse( cmXmlH_t h, const cmChar_t* fn )
 cmXmlRC_t cmXmlClear( cmXmlH_t h )
 {
 }
+
+cmXmlRC_t _cmXmlSyntaxError( cmXml_t* p )
+{
+  return _cmErrMsg(&p->err,kSyntaxErrorXmlRC,"Syntax error on line '%i.",p->line);
+}
+
+// Allocate a zeroed node from the linked heap and attach it to the document.
+//
+// flags  - kNormalXmlFl : the node is linked into the tree as the last child
+//                         of the node on the top of the parse stack. The first
+//                         normal node becomes the root and, when the stack is
+//                         empty, the top of the stack.
+//          kDeclXmlFl   : the node is stored in p->decl.
+//          kDoctypeXmlFl: the node is stored in p->doctype.
+// label  - label text or NULL. The first 'labelN' characters are copied
+//          with cmLhAllocStrN().
+// labelN - count of characters in 'label'.
+//
+// Returns the new node, or NULL (after reporting a syntax error) if 'flags'
+// does not select one of the three node types.
+cmXmlNode_t* _cmXmlNodeAlloc( cmXml_t* p, unsigned flags, const cmChar_t* label, unsigned labelN )
+{
+  cmXmlNode_t* np = NULL;
+
+  // reject unknown node types before allocating anything
+  if( !cmIsFlag(flags,kNormalXmlFl) && !cmIsFlag(flags,kDeclXmlFl) && !cmIsFlag(flags,kDoctypeXmlFl) )
+  {
+    _cmXmlSyntaxError(p);
+    return NULL;
+  }
+
+  np        = cmLhAllocZ(p->heapH,cmXmlNode_t,1);
+  np->flags = flags;
+
+  if( cmIsFlag(flags,kNormalXmlFl) )
+  {
+    if( p->root == NULL )
+      p->root = np;
+
+    if( p->stack == NULL )
+      p->stack = np;
+    else
+    {
+      np->parent = p->stack;
+
+      if( p->stack->children == NULL )
+        p->stack->children = np;
+      else
+      {
+        // append the new node to the end of the sibling chain
+        cmXmlNode_t* lastp = p->stack->children;
+
+        while( lastp->sibling != NULL )
+          lastp = lastp->sibling;
+
+        lastp->sibling = np;
+      }
+    }
+  }
+  else
+  {
+    // the same precedence as the flag tests above: decl. before doctype
+    if( cmIsFlag(flags,kDeclXmlFl) )
+      p->decl = np;
+    else
+      p->doctype = np;
+  }
+
+  if( label != NULL )
+    np->label = cmLhAllocStrN(p->heapH,label,labelN);
+
+  return np;
+}
+
+// Allocate an attribute record from the linked heap and push it onto the
+// front of the attribute list of 'np'.
+//
+// label/labelN - attribute label text and its length in characters.
+// value/valueN - attribute value text and its length in characters.
+// A NULL or zero length string leaves the corresponding field NULL.
+//
+// Returns 'np'.
+cmXmlNode_t* _cmXmlAttrAlloc( cmXml_t* p, cmXmlNode_t* np, const cmChar_t* label, unsigned labelN, const cmChar_t* value, unsigned valueN )
+{
+  cmXmlAttr_t* ap = cmLhAllocZ(p->heapH, cmXmlAttr_t,1);
+
+  // the source text is delimited by length, not by a terminating zero,
+  // so the length limited string allocator must be used
+  if( label != NULL && labelN > 0 )
+    ap->label = cmLhAllocStrN(p->heapH,label,labelN);
+
+  if( value != NULL && valueN > 0 )
+    ap->value = cmLhAllocStrN(p->heapH,value,valueN);
+
+  // link the new attribute in as the head of the node's attribute list
+  ap->link  = np->attr;
+  np->attr  = ap;
+
+  return np;
+}
+
+
+// Return true when the cursor is at or beyond the end of the text buffer.
+bool _cmXmlIsEof( cmXml_t* p )
+{
+  const cmChar_t* endPtr = p->b + p->bn;  // one past the last buffered character
+
+  return p->c >= endPtr;
+}
+
+// Advance the cursor by one character and keep the line count current.
+// Return false if the cursor was already at EOF or if the advance moved
+// the cursor onto EOF. In both cases the cursor must not be dereferenced.
+bool _cmXmlAdvance( cmXml_t* p )
+{
+  if( _cmXmlIsEof(p) )
+    return false;
+
+  p->c += 1;
+
+  // Check for EOF before examining the new character so that the
+  // location one past the end of the buffer is never read.
+  if( _cmXmlIsEof(p) )
+    return false;
+
+  if( *p->c == '\n' )
+    p->line += 1;
+
+  return true;
+}
+
+// Advance the cursor to the next non-white char.
+// If the cursor is already on a non-white character it is not moved.
+// Return a pointer to a non-space character.
+// Return NULL if the EOF is encountered.
+const cmChar_t*  _cmXmlAdvanceToNextNonWhite( cmXml_t* p )
+{
+  if( _cmXmlIsEof(p) )
+    return NULL;
+
+  // The cast keeps isspace() well defined for characters with negative
+  // values. The EOF test guarantees that the cursor is inside the
+  // buffer each time it is dereferenced.
+  while( isspace((unsigned char)*p->c) )
+    if( _cmXmlAdvance(p) == false || _cmXmlIsEof(p) )
+      return NULL;
+
+  return p->c;
+}
+
+// Advance to the next white space character, 'c0', or 'c1'.
+// If the cursor is already on such a character it is not moved.
+// Returns a pointer to a white space character, 'c0', or 'c1'.
+// Returns NULL if the EOF is encountered.
+const cmChar_t*  _cmXmlAdvanceToNextWhiteOr( cmXml_t* p, cmChar_t c0, cmChar_t c1 )
+{
+  if( _cmXmlIsEof(p) )
+    return NULL;
+
+  // The cast keeps isspace() well defined for characters with negative
+  // values. The EOF test guarantees that the cursor is inside the
+  // buffer each time it is dereferenced.
+  while( !isspace((unsigned char)*p->c) && *p->c!=c0 && *p->c!=c1 )
+    if(_cmXmlAdvance(p) == false || _cmXmlIsEof(p) )
+      return NULL;
+
+  return p->c;
+}
+
+// Advance past leading white space followed by 's'.
+// Note that 's' is expected to immediately follow any leading white space.
+// Returns a pointer to the character after 's'. If 's' ends on the last
+// character of the buffer the returned pointer is at EOF and must not
+// be dereferenced.
+// Returns NULL if 's' is not encountered. In this case the cursor has been
+// moved past the white space and past any leading part of 's' that matched.
+const cmChar_t* _cmXmlAdvancePast( cmXml_t* p, const cmChar_t* s )
+{
+  // skip leading white space
+  while( !_cmXmlIsEof(p) && isspace((unsigned char)*p->c) )
+    _cmXmlAdvance(p);
+
+  // match 's' one character at a time
+  for(; *s != 0; ++s )
+  {
+    // the EOF test guarantees the cursor is never read past the buffer
+    if( _cmXmlIsEof(p) || *p->c != *s )
+      return NULL;
+
+    // the cursor is known to be inside the buffer so the advance itself
+    // can't fail; its result only tells whether EOF was reached
+    _cmXmlAdvance(p);
+  }
+
+  return p->c;
+}
+
+// Advance past the current character and then
+// advance to the next occurrence of 's' and return
+// a pointer to the last char in 's'.
+// The character under the cursor on entry is never part of the match.
+// Returns NULL if 's' is empty or if EOF is reached before 's' is found.
+const cmChar_t* _cmXmlAdvanceToNext( cmXml_t* p, const cmChar_t* s )
+{
+  size_t          n     = strlen(s);
+  const cmChar_t* begin = NULL;
+
+  if( n == 0 || _cmXmlIsEof(p) )
+    return NULL;
+
+  // first character which is allowed to be part of a match
+  begin = p->c + 1;
+
+  while( _cmXmlAdvance(p) && !_cmXmlIsEof(p) )
+  {
+    // A match ends at the cursor when the 'n' characters ending there are
+    // equal to 's'. Comparing the whole window at every position (rather
+    // than tracking a partial match count) handles repeated prefixes
+    // such as finding "-->" in "--->".
+    if( (size_t)(p->c - begin) + 1 >= n && memcmp(p->c - (n-1), s, n) == 0 )
+      return p->c;
+  }
+
+  return NULL;
+}
+
+// Advance the cursor by exactly one character.
+// Returns a pointer to the new current character or NULL
+// if the cursor was at EOF or has been moved onto EOF.
+const cmChar_t* _cmXmlAdvanceOne( cmXml_t* p )
+{
+  if( _cmXmlIsEof(p) )
+    return NULL;
+
+  p->c += 1;
+
+  if( _cmXmlIsEof(p) )
+    return NULL;
+
+  // count lines in the same way as _cmXmlAdvance() so that the line
+  // number reported by syntax errors stays correct
+  if( *p->c == '\n' )
+    p->line += 1;
+
+  return p->c;
+}
+
+// Parse one attribute of the form:  label = 'value'  or  label = "value".
+//
+// 'endChar' is the character which may precede the closing '>' of the
+// tag (the callers pass '/' or '?').  If the next non-white character is
+// 'endChar' or '>' then there is no attribute to parse: the cursor is left
+// on that character and kOkXmlRC is returned.
+//
+// On success the cursor is left just past the closing quote.
+// The label and value are located but are not yet stored on a node.
+cmXmlRC_t  _cmXmlParseAttr( cmXml_t* p, cmChar_t endChar )
+{
+  const cmChar_t* l0       = NULL; // first char of the label
+  const cmChar_t* l1       = NULL; // first char following the label
+  const cmChar_t* v0       = NULL; // opening quote of the value
+  const cmChar_t* v1       = NULL; // closing quote of the value
+  cmChar_t        quote[2] = { 0, 0 };
+
+  // advance to the next label
+  if(( l0 = _cmXmlAdvanceToNextNonWhite(p)) == NULL )
+    return _cmXmlSyntaxError(p);
+
+  // if the end of the attribute list was encountered
+  if( *p->c == endChar || *p->c == '>' )
+    return kOkXmlRC;
+
+  // advance past last character in label
+  if((l1 = _cmXmlAdvanceToNextWhiteOr(p,'=',' ')) == NULL )
+    return _cmXmlSyntaxError(p);
+
+  // advance past the next '='
+  if( _cmXmlAdvancePast(p,"=") == NULL )
+    return _cmXmlSyntaxError(p);
+
+  // advance to the next non-white character
+  if( (v0 = _cmXmlAdvanceToNextNonWhite(p)) == NULL )
+    return _cmXmlSyntaxError(p);
+
+  // XML allows attribute values to be delimited by single or double quotes
+  if( *p->c != '\'' && *p->c != '"' )
+    return _cmXmlSyntaxError(p);
+
+  // the value ends at the next occurrence of the same quote character
+  quote[0] = *p->c;
+  if( (v1 = _cmXmlAdvanceToNext(p,quote)) == NULL )
+    return _cmXmlSyntaxError(p);
+
+  // advance past the ending quote
+  if( _cmXmlAdvanceOne(p) == NULL )
+    return _cmXmlSyntaxError(p);
+
+  // p->c now points just past the ending quote
+  return kOkXmlRC;
+}
+
+// Parse zero or more attributes followed by an optional 'endChar'
+// and the closing '>' of the tag.
+//
+// 'endChar' is '/' for node tags ( <a b='c' /> ) and '?' for the
+// xml declaration ( <?xml ... ?> ).
+//
+// On success the cursor is left just past the closing '>'.
+cmXmlRC_t _cmXmlParseAttrList( cmXml_t* p, cmChar_t endChar )
+{
+  cmXmlRC_t rc = kOkXmlRC;
+
+  for(;;)
+  {
+    // Skip white space before testing for the end of the list so that
+    // white space in front of the closing '>' or 'endChar' is not
+    // mistaken for the beginning of another attribute.
+    if( _cmXmlAdvanceToNextNonWhite(p) == NULL )
+      return _cmXmlSyntaxError(p);
+
+    if( *p->c == endChar || *p->c == '>' )
+      break;
+
+    // the attribute parser has already reported any error
+    if((rc = _cmXmlParseAttr(p,endChar)) != kOkXmlRC )
+      return rc;
+  }
+
+  if( *p->c == endChar )
+  {
+    if( endChar == '/' )
+    {
+      // this is a simple node
+    }
+
+    // step over 'endChar' - the closing '>' must follow it
+    if( _cmXmlAdvanceOne(p) == NULL )
+      return _cmXmlSyntaxError(p);
+  }
+
+  if( *p->c != '>' )
+    return _cmXmlSyntaxError(p);
+
+  if( _cmXmlAdvancePast(p,">") == NULL )
+    return _cmXmlSyntaxError(p);
+
+  // p->c is now past the ending '>'
+
+  return rc;
+}
+
+// Parse a single white space delimited, or quoted, token from inside
+// a DOCTYPE tag.
+//
+// If the next non-white character is the closing '>' then no token is
+// consumed, the cursor is left on the '>', and kOkXmlRC is returned.
+// Otherwise the cursor is left just past the token (and just past the
+// closing quote for a quoted token).
+//
+// 'np' is the DOCTYPE node. The token is located but is not yet stored on it.
+cmXmlRC_t _cmXmlParseDoctypeToken( cmXml_t* p, cmXmlNode_t* np )
+{
+  const cmChar_t* t0 = NULL;  // first char of the token (the opening quote of a quoted token)
+  const cmChar_t* t1 = NULL;  // closing quote of a quoted token or first char following a plain token
+
+  // advance to the first char in the doctype token
+  if((t0 = _cmXmlAdvanceToNextNonWhite(p) ) == NULL )
+    return _cmXmlSyntaxError(p);
+
+  // if the end of the tag was encountered
+  if( *p->c == '>' )
+    return kOkXmlRC;
+
+  // if the token begins with a single or double quote
+  if( *p->c == '\'' || *p->c == '"' )
+  {
+    cmChar_t quote[2] = { 0, 0 };
+
+    // the token ends at the next occurrence of the same quote character
+    quote[0] = *p->c;
+    if((t1 = _cmXmlAdvanceToNext(p,quote)) == NULL )
+      return _cmXmlSyntaxError(p);
+
+    // Step over the closing quote. If the cursor were left on the quote
+    // the next call would treat it as the start of a new quoted token.
+    if( _cmXmlAdvanceOne(p) == NULL )
+      return _cmXmlSyntaxError(p);
+  }
+  else
+  {
+    if((t1 = _cmXmlAdvanceToNextWhiteOr(p,'>',' ')) == NULL )
+      return _cmXmlSyntaxError(p);
+  }
+
+  // p->c now points just past the last character in the token
+
+  return kOkXmlRC;
+}
+
+// Parse the body of a DOCTYPE tag.
+// On entry the cursor is just past the "DOCTYPE" keyword.
+// On success a DOCTYPE node has been allocated and the cursor
+// is just past the closing '>'.
+cmXmlRC_t _cmXmlParseDoctype( cmXml_t* p )
+{
+  cmXmlRC_t    rc = kOkXmlRC;
+  cmXmlNode_t* np;
+
+  if((np = _cmXmlNodeAlloc(p,kDoctypeXmlFl,"DOCTYPE",strlen("DOCTYPE"))) == NULL )
+    return cmErrLastRC(&p->err);
+
+  // consume tokens until the cursor is on the closing '>'
+  while( !_cmXmlIsEof(p) && *p->c != '>' )
+    if((rc = _cmXmlParseDoctypeToken(p,np)) != kOkXmlRC )
+      return rc;
+
+  // Step over the closing '>'.
+  // This fails, and so reports an error, if EOF was reached without finding it.
+  if( _cmXmlAdvancePast(p,">") == NULL )
+    return _cmXmlSyntaxError(p);
+
+  return rc;
+}
+
+// Node tags are tags that begin with a '<' and are not
+// followed by any special character.
+// On entry the cursor is just past the opening '<'.
+// On success the cursor is just past the closing '>'.
+// The tag label is located but a node is not yet allocated for it.
+cmXmlRC_t _cmXmlParseNodeTag( cmXml_t* p )
+{
+  cmXmlRC_t       rc = kOkXmlRC;
+  const cmChar_t* l0 = NULL;  // first char of the tag label
+  const cmChar_t* l1 = NULL;  // first char following the tag label
+
+  // Advance to the first character of the tag label.
+  if((l0 = _cmXmlAdvanceToNextNonWhite(p)) == NULL )
+    return _cmXmlSyntaxError(p);
+
+  // Advance to the first character following the tag label.
+  if((l1 = _cmXmlAdvanceToNextWhiteOr(p,'/','>')) == NULL )
+    return _cmXmlSyntaxError(p);
+
+  // a tag must have a label ("<>" and "</>" are not valid)
+  if( l1 == l0 )
+    return _cmXmlSyntaxError(p);
+
+  // Look for attributes. The attribute list parser has already
+  // reported any error so it is not reported a second time here.
+  if((rc = _cmXmlParseAttrList(p,'/')) != kOkXmlRC )
+    return rc;
+
+  // p->c is now past the ending '>'
+
+  return rc;
+}
+
+// Parse an end-tag ( </label> ) and pop the matching node off the parse stack.
+// On entry the cursor must be on the '/' which follows the opening '<'.
+// On success the cursor is just past the closing '>' (which may be EOF).
+// A syntax error is reported if no node is open or if the end-tag label
+// is empty or differs from the label of the node on the top of the stack.
+cmXmlRC_t _cmXmlReadEndTag( cmXml_t* p )
+{
+  const cmChar_t* l0 = NULL;  // first char of the label
+  const cmChar_t* l1 = NULL;  // last char of the label
+  size_t          n  = 0;     // length of the label
+
+  assert( *p->c == '/' );
+
+  // an end-tag is only legal while a labeled node is on the stack
+  if( p->stack == NULL || p->stack->label == NULL )
+    return _cmXmlSyntaxError(p);
+
+  // advance past the '/'
+  if(( l0 = _cmXmlAdvanceOne(p)) == NULL )
+    return _cmXmlSyntaxError(p);
+
+  // "</>" has no label. This must be tested here because the search
+  // below begins with the character following the current character.
+  if( *l0 == '>' )
+    return _cmXmlSyntaxError(p);
+
+  // advance to the ending '>'
+  l1 = _cmXmlAdvanceToNext(p,">");
+  if( l1 == NULL || _cmXmlIsEof(p) || *p->c != '>' )
+    return _cmXmlSyntaxError(p);
+
+  // Advance past the '>'. The cursor is known to be on the '>' so the
+  // only possible 'failure' is landing on EOF, which is legal following
+  // the last end-tag in the document.
+  _cmXmlAdvanceOne(p);
+
+  // l1 is on the '>': back up to the last character of the label
+  // and trim trailing space
+  l1 -= 1;
+  while( l1>l0 && isspace((unsigned char)*l1) )
+    --l1;
+
+  // if nothing but white space preceded the '>' then the label is empty
+  if( isspace((unsigned char)*l1) )
+    return _cmXmlSyntaxError(p);
+
+  // The label should match the node on the top of the stack.
+  // The lengths are compared as well as the characters because
+  // strncmp() alone would accept an end-tag label which is
+  // only a prefix of the node label.
+  n = (size_t)(l1-l0) + 1;
+  if( strlen(p->stack->label) != n || strncmp( p->stack->label, l0, n ) != 0 )
+    return _cmXmlSyntaxError(p);
+
+  // pop the stack
+  p->stack = p->stack->parent;
+
+  return kOkXmlRC;
+}
+  
+
+
+// Read the next tag: an end-tag, the xml declaration, a comment,
+// a DOCTYPE tag, or a normal node tag.
+//
+// Leading white space is skipped. On success the cursor is just past
+// the closing '>' of the tag.
+//
+// *newNodeRef is always set to NULL: no node is returned yet.
+//
+// Returns kOkXmlRC with the cursor at EOF when there are no more tags.
+// NOTE(review): a comment whose text is empty ( <!----> ) is not recognized
+// because _cmXmlAdvanceToNext() begins searching after the current character.
+cmXmlRC_t  _cmXmlReadTag( cmXml_t* p, cmXmlNode_t** newNodeRef )
+{
+  cmXmlRC_t rc = kOkXmlRC;
+
+  assert(newNodeRef != NULL );
+  *newNodeRef = NULL;
+
+  // No leading '<' was found
+  if( _cmXmlAdvancePast(p,"<") == NULL )
+  {
+    // running out of text is not an error - anything else is
+    return _cmXmlIsEof(p) ? kOkXmlRC : _cmXmlSyntaxError(p);
+  }
+
+  // a '<' must be followed by at least one character
+  if( _cmXmlIsEof(p) )
+    return _cmXmlSyntaxError(p);
+
+  // examine the character following the opening '<'
+  switch( *p->c )
+  {
+    // node end tag
+    case '/':
+      return _cmXmlReadEndTag(p);
+
+    // declaration tag
+    case '?':
+      // the cursor is still on the '?' so it is matched along with "xml"
+      if( _cmXmlAdvancePast(p,"?xml") == NULL )
+        return _cmXmlSyntaxError(p);
+
+      if( _cmXmlNodeAlloc(p,kDeclXmlFl, "xml",strlen("xml") ) == NULL )
+        return cmErrLastRC(&p->err);
+
+      if((rc = _cmXmlParseAttrList(p,'?')) != kOkXmlRC )
+        return rc;
+
+      break;
+
+    case '!':
+      // the character following the '!' selects the tag type
+      if( p->c + 1 >= p->b + p->bn )
+        return _cmXmlSyntaxError(p);
+
+      switch( *(p->c+1) )
+      {
+        // comment node
+        case '-':
+          // the cursor is still on the '!' so it is matched along with "--"
+          if( _cmXmlAdvancePast(p,"!--") == NULL )
+            return _cmXmlSyntaxError(p);
+
+          // advance to the '>' which ends the comment
+          if( _cmXmlAdvanceToNext(p,"-->") == NULL || _cmXmlIsEof(p) )
+            return _cmXmlSyntaxError(p);
+
+          // Step over the '>'. Landing on EOF is legal because
+          // a comment may be the last thing in the document.
+          _cmXmlAdvanceOne(p);
+
+          // p->c is just after "-->"
+          break;
+
+          // DOCTYPE node
+        case 'D':
+          // the cursor is still on the '!' so it is matched along with "DOCTYPE"
+          if( _cmXmlAdvancePast(p,"!DOCTYPE")==NULL )
+            return _cmXmlSyntaxError(p);
+
+          // the DOCTYPE parser has already reported any error
+          if((rc = _cmXmlParseDoctype(p)) != kOkXmlRC )
+            return rc;
+
+          // p->c is just after ">"
+
+          break;
+
+        default:
+          return _cmXmlSyntaxError(p);
+      }
+      break;
+
+    default:
+      // normal node
+      if((rc = _cmXmlParseNodeTag(p)) != kOkXmlRC )
+        return rc;
+
+      // p->c is just after ">"
+
+  }
+
+  return rc;
+}
+
+// Read a complete node (begin tag, body, end tag).
+// TODO: not yet implemented - see the read_node() sketch at the top of this file.
+// An explicit value is returned so that callers never read an
+// indeterminate result from this stub.
+cmXmlRC_t  _cmXmlReadNode( cmXml_t* p )
+{
+  (void)p;
+  return kOkXmlRC;
+}
--- a/cmXml.h
+++ b/cmXml.h
@ -10,22 +10,39 @@ extern "C" {
    kOkXmlRC = cmOkRC,
    kMemAllocErrXmlRC,
    kLHeapXmlRC,
-    kLexErrXmlRC
+    kLexErrXmlRC,
+    kSyntaxErrorXmlRC
  }; 
  
  typedef struct cmXmlAttr_str
  {
-    const cmChar_t*      label;
-    const cmChar_t*      value;
+    const cmChar_t*       label;
+    const cmChar_t*       value;    
    struct cmXmlAttr_str* link;
  } cmXmlAttr_t;
+
+  // Node type flags. These values are passed as the 'flags' argument
+  // of _cmXmlNodeAlloc() in cmXml.c to select how a new node is attached
+  // to the document.
+  enum
+  {
+    kDeclXmlFl    = 0x0001,  // xml declaration node <?xml ... ?>
+    kDoctypeXmlFl = 0x0002,  // DOCTYPE node <!DOCTYPE ... >
+    kNormalXmlFl  = 0x0004,  // normal node which is linked into the node tree
+    
+  };
  
  typedef struct cmXmlNode_str
  {
+    unsigned              flags;
+    
+    const cmChar_t*       label;
+    const cmChar_t*       dataStr;
+    
+    cmXmlAttr_t*          attr;
+    
    struct cmXmlNode_str* parent;
    struct cmXmlNode_str* children;
    struct cmXmlNode_str* sibling;
-    cmXmlAttr_t*          attr;
+    
+    
  } cmXmlNode_t;

  typedef cmHandle_t cmXmlH_t;