Browse Source

cmXml.h/c : Updated but still incomplete.

master
kevin 9 years ago
parent
commit
a839c3dd92
2 changed files with 550 additions and 58 deletions
  1. 529
    54
      cmXml.c
  2. 21
    4
      cmXml.h

+ 529
- 54
cmXml.c View File

@@ -22,7 +22,7 @@ node     -> beg-node node-body end-node
22 22
 node-body -> data-text
23 23
           |  node
24 24
   
25
-beg-node   -> "<" tag-label  attr-list ">"
25
+beg-node   -> "<" tag-label  attr-list {"/"} ">"
26 26
 end-node   -> "<" tag-label "/>"
27 27
 attr-list  -> attr*
28 28
 attr       -> attr-label "=" qstring
@@ -41,69 +41,96 @@ dt-text    -> A string of characters beginning with a non-whitespace
41 41
 
42 42
 cmmt-text  -> A string of characters ending with '-->'
43 43
 
44
-*/  
45
- 
46
-cmXmlH_t cmXmlNullHandle = cmSTATIC_NULL_HANDLE;
44
+*/
47 45
 
48
-typedef struct
46
+/*
47
+
48
+
49
+t = get_next_attr_token(p,end_char, tn* )
49 50
 {
50
-  cmErr_t      err;   // 
51
-  cmLHeapH_t   heapH; // linked heap stores all node memory
52
-  cmLexH       lexH; 
53
-  cmXmlNode_t* root;
54
-} cmXml_t;
55 51
 
56
-enum
57
-{
58
-  kTagBegLexTId = kUserLexTId+1,
59
-  kTagEndLexTId,
60
-  kDeclBegLexTId,
61
-  kDeclEndLexTId,
62
-  kSpclBegLexTId,
63
-  kDocTypeLexTId,
64
-  kCmmtBegLexTId,
65
-  kCmmtEndLexTId,
66
-  kEqualLexTId
67
-};
68
-
69
-cmXmlToken_t _cmXmlTokenArray[] = 
70
-{
71
-  { kTagBegLexTId = kUserLexId+1,  "<" },
72
-  { kTagEndLexTid,  ">" },
73
-  { kDeclBegLexTId, "<?" },
74
-  { kDeclEndLexTid, "?>" },
75
-  { kSpclBegLexTId, "<!" },
76
-  { kDocTypeLexTId, "<!DOCTYPE" },
77
-  { kCmmtBegLexTId, "<!--" },
78
-  { kCmmtEndLexTid, "-->" },
79
-  { kEqualLexTid,   "=" },
80
-  { kErrorLexTId,""}  
81
-};
82
-
83
-// Match a tag label.  
84
-// A string ending with a <space> or '>'
85
-unsigned cmLexTagLabelMatcher( const cmChar_t* cp, unsigned cn )
86
-{
87
-  for(i=0; i<cn; ++i)
88
-    if( cp[i] == '>' || isspace(cp[i]) )
89
-      break;
90
-  return i>0 ? i-1 : 0;  
91 52
 }
92 53
 
93
-unsigned cmLexStringMatcher( const cmChar_t* cp, unsigned cn )
54
+parse_attr_list(p,end_char)
94 55
 {
95
-  for(i=0; i<cn; ++i)
96
-  {
97
-    if( cp[i] == ' ')
98
-      break;
56
+}
57
+
58
+read_beg_tag(p)
59
+{
60
+   c = goto_next_non_white_char(p)
61
+
62
+   if( c != '<' )
63
+     error();
64
+
65
+   c = goto_next_non_white_char(p)
66
+
67
+   if c == '?'
68
+   {
69
+      end_tag_str = "?";
70
+      if( scan_past(p,"xml") == false )
71
+        error();
72
+
73
+      parse_attr_list(p,'?');
74
+   }
75
+
76
+   if c == '!'
77
+   {
78
+      if( scan_past(p,"--") )
79
+      {
80
+         if(go_past(p,"-->")==false)
81
+           error();
82
+      }
83
+
84
+      if( scan_past(p,"DOCTYPE") )
85
+      {
86
+          while( s = get_next_attr_token(p,'>') != NULL )
87
+             store_attr(p,s,"");
88
+      }
89
+   }
99 90
     
100
-    if( cp[i] == '<' )
101
-      break;
91
+}
102 92
 
103
-  }
104
-  return i>0 ?   
93
+read_body( p )
94
+{
95
+  c = goto_next_non_white_char(p);
96
+
97
+  if c == '<'
98
+    read_node(p)
99
+  else
100
+    read_data_string(p)
105 101
 }
106 102
 
103
+n = read_node( p )
104
+{
105
+   t = read_beg_tag(p);
106
+
107
+   if( is_beg_tag(t) )
108
+   {
109
+      read_body()
110
+      read_end_tag()
111
+   }
112
+}
113
+
114
+ */
115
+ 
116
+cmXmlH_t cmXmlNullHandle = cmSTATIC_NULL_HANDLE;
117
+
118
+typedef struct
119
+{
120
+  cmErr_t      err;     // 
121
+  cmLHeapH_t   heapH;   // linked heap stores all node memory
122
+  
123
+  cmChar_t*    b;       // base of the text buffer
124
+  unsigned     bn;      // length of the text buffer in characters
125
+  cmChar_t*    c;       // current lexer position
126
+  
127
+  cmXmlNode_t* root;    // root XML tree node
128
+  cmXmlNode_t* decl;    // xml declaratoin node <? ... ?>
129
+  cmXmlNode_t* doctype; // DOCTYPE  node
130
+
131
+  cmXmlNode_t* stack;   // parsing stack
132
+} cmXml_t;
133
+
107 134
 
108 135
 cmXml_t* _cmXmlHandleToPtr( cmXmlH_t h )
109 136
 {
@@ -228,3 +255,451 @@ cmXmlRC_t cmXmlParse( cmXmlH_t h, const cmChar_t* fn )
228 255
 cmXmlRC_t cmXmlClear( cmXmlH_t h )
229 256
 {
230 257
 }
258
+
259
+cmXmlRC_t _cmXmlSyntaxError( cmXml_t* p )
260
+{
261
+  return _cmErrMsg(&p->err,kSyntaxErrorXmlRC,"Syntax error on line '%i.",p->line);
262
+}
263
+
264
+cmXmlNode_t* _cmXmlNodeAlloc( cmXml_t* p, unsigned flags, const cmChar_t* label, unsigned labelN )
265
+{
266
+  cmXmlNode_t* np = cmLhAllocZ(p->heapH,cmXmlNode_t,1);
267
+
268
+  if( cmIsFlag(kNormalXmlFl) )
269
+  {  
270
+    if( p->root == NULL )
271
+      p->root = np;
272
+
273
+    if( p->stack == NULL )
274
+      p->stack = np;
275
+    else
276
+    {
277
+      np->parent = p->stack;
278
+      
279
+      if( p->stack->children == NULL )
280
+        p->stack->children = np;
281
+      else
282
+      {
283
+        cmXmlNode_t* n0p = NULL;
284
+        cmXmlNode_t* n1p = p->stack->children;
285
+        
286
+        for(; n1p != NULL; n1p=n1p->sibling )
287
+          n0p = n1p;
288
+
289
+        n0p->sibling = np;
290
+      }
291
+    }
292
+  }
293
+  else
294
+  {
295
+    if( cmIsFlag(kDeclXmlFl) )
296
+      p->decl = np;
297
+    else
298
+    {
299
+      if( cmIsFlag(kDoctypeXmlF0 ) )
300
+        p->doctype = np;
301
+      else
302
+      {
303
+        _cmXmlSyntaxError(p);
304
+        return NULL;
305
+      }
306
+    }
307
+  }
308
+  
309
+  if( label != NULL )
310
+    np->label = cmLhAllocStrN(p->heapH,label,labelN);
311
+
312
+  return np;
313
+}
314
+
315
+cmXmlNode_t* _cmXmlAttrAlloc( cmXml_t* p, cmXmlNode_t* np, const cmChar_t* label, unsigned labelN, const cmChar_t* value, unsigned valueN )
316
+{
317
+  cmXmlAttr_t* ap = cmLhAllocZ(p->heapH, cmXmlAttr_t,1);
318
+
319
+  if( label != NULL && labelN > 0 )
320
+    ap->label = cmLhAllocStr(p->heapH,label,labelN);
321
+
322
+  if( value != NULL and valueN > 0 )
323
+    ap->value = cmLhAllocStr(p->attrH,value,valueN);
324
+  
325
+  ap->link  = np->attr;
326
+  np->attr  = ap;
327
+  
328
+  return np;
329
+}
330
+
331
+
332
+bool _cmXmlIsEof( cmXml_t* p )
333
+{  return p->c >= p->b + p->bn; }
334
+
335
+// Return false if EOF is encountered
336
+bool _cmXmlAdvance( cmXml_t* p )
337
+{
338
+  if( _cmXmlIsEof(p) )
339
+    return false;
340
+
341
+  p->c += 1;
342
+
343
+  if( *p->c == '\n' )
344
+    p->line += 1;
345
+  
346
+  return true;
347
+}
348
+
349
+// Advance the cursor to the next non-white char
350
+// Return a pointer to a non-space character.
351
+// Return NULL if the EOF is encountered.
352
+const cmChar_t*  _cmXmlAdvanceToNextNonWhite( cmXml_t* p )
353
+{
354
+  if( _cmXmlIsEof(p) )
355
+    return NULL;
356
+  
357
+  while( isspace(*p->c) )
358
+    if( _cmXmlAdvance(p) == false )
359
+      return NULL;
360
+
361
+  return p->c;
362
+}
363
+
364
+// Advance to the next white space character or 'c'.
365
+// Returns a pointer to a white space or 'c'.
366
+const cmChar_t*  _cmXmlAdvanceToNextWhiteOr( cmXml_t* p, cmChar_t c0, cmChar_t c1 )
367
+{
368
+  if( _cmXmlIsEof(p) )
369
+    return NULL;
370
+
371
+  while( isspace(*p->c)==false && *p->c!=c0 && *p->c!=c1 )
372
+    if(_cmXmlAdvance(p) == false )
373
+      return NULL;
374
+
375
+  return p->c;
376
+}
377
+
378
+// Advance past leading white space followed by 's'.
379
+// Note that 's' is expected to immediately follow any leading white space.
380
+// Returns a pointer to the character after 's'.
381
+// Returns NULL if 'c' is not encountered
382
+const cmChar_t* _cmXmlAdvancePast( cmXml_t* p, const cmChar_t* s )
383
+{
384
+  if( _cmXmlIsEof(p) )
385
+    return NULL;
386
+
387
+  while( isspace(*p->c) )
388
+    if( _cmXmlAdvance(p) == false )
389
+      return NULL;
390
+
391
+  for(; *s && *p->c == *s; ++s )
392
+    if( _cmXmlAdvance(p) == false )
393
+      return NULL;
394
+
395
+  return *s==0 ? p->c : NULL; 
396
+}
397
+
398
+// Advance past the current character and then 
399
+// advance to the next occurrence of 's' and return
400
+// a pointer to the last char in 's'.
401
+const cmChar_t* _cmXmlAdvanceToNext( cmXml_t* p, cmChar_t* s )
402
+{
403
+  unsigned i = 0;
404
+  unsigned n = strlen(s);
405
+
406
+  while( _cmXmlAdvance(p) )
407
+  {
408
+    if( *p->c != s[i] )
409
+      i = 0;
410
+    else
411
+    {
412
+      i+= 1;
413
+      if( i == n )
414
+        break;
415
+    }
416
+  }
417
+  return p->c;
418
+}
419
+
420
+// Return the character following the current character.
421
+const cmChar_t* _cmXmlAdvanceOne( cmXml_t* p )
422
+{
423
+  if( _cmXmlIsEof(p) )
424
+    return NULL;
425
+
426
+  p->c += 1;
427
+
428
+  return _cmXmlIsEof(p) ? NULL : p->c;
429
+}
430
+
431
+cmXmlRC_t  _cmXmlParseAttr( cmXml_t* p, cmChar_t endChar )
432
+{
433
+  cmXmlRC_t       rc = kOkXmlRC;
434
+  const cmChar_t* l0 = NULL;
435
+  const cmChar_t* l1 = NULL;
436
+  const cmChar_t* v0 = NULL;
437
+  const cmChar_t* v1 = NULL;
438
+
439
+  // advance to the next label
440
+  if(( l0 = _cmXmlAdvanceToNextNonWhite(p)) == NULL )
441
+    return _cmXmlSyntaxError(p);
442
+
443
+  // if the 'endChar' was encountered
444
+  if( *p->c == endChar )
445
+    return kOkXmlRC;
446
+  
447
+  // advance past last character in label
448
+  if((l1 = _cmXmlAdvanceToNextWhiteOr(p,'=',' ')) == NULL )
449
+    return _cmXmlSyntaxError(p);
450
+
451
+  // advance past the next '='
452
+  if( _cmXmlAdvancePast(p,"=") == NULL )
453
+    return _cmXmlSyntaxError(p);
454
+  
455
+  // advance to the next non-white character
456
+  if( (v0 = _cmXmlAdvanceToNextNonWhite(p)) == NULL )
457
+    return _cmXmlSyntaxError(p);
458
+
459
+  // the first character in the value must be a single quote
460
+  if( *p->c != '\'' )
461
+    return _cmXmlSyntaxError(p);
462
+  
463
+  // advance to the next single quote
464
+  if( (v1 = _cmXmlAdvanceToNext(p,"'")) == NULL )
465
+    return _cmXmlSyntaxError(p);
466
+
467
+  // advance past the ending single quote
468
+  if( _cmXmlAdvanceOne(p) == NULL )
469
+    return _cmXmlSyntaxError(p);
470
+
471
+  // p->c now points just past the ending single quote
472
+  return rc;
473
+}
474
+
475
+cmXmlRC_t _cmXmlParseAttrList( cmXml_t* p, cmChar_t endChar )
476
+{
477
+  cmXmlRC_t rc = kOkXmlRC;
478
+
479
+  
480
+  while( *p->c != endChar && *p->c != '>' )
481
+    if((rc = _cmXmlParseAttr(p,endChar)) != kOkXmlRC )
482
+      break;
483
+
484
+  if( *p->c == endChar )
485
+  {
486
+    if( endChar = '/' )
487
+    {
488
+      // this is a simple node
489
+    }
490
+    
491
+    if( _cmXmlAdvanceOne(p) == NULL )
492
+      return _cmXmlSyntaxError(p);
493
+  }
494
+  
495
+  if( *p->c != '>' )
496
+    return _cmXmlSyntaxError(p);
497
+  
498
+  if( _cmXmlAdvancePast(p,">") == NULL )
499
+    return _cmXmlSyntaxError(p);
500
+
501
+  // p->c is now past the ending '>'
502
+  
503
+  return rc;
504
+}
505
+
506
+cmXmlRC_t _cmXmlParseDoctypeToken( cmXml_t* p, cmXmlNode_t* np )
507
+{
508
+  const cmChar_t* t0 = NULL;
509
+  const cmChar_t* t1 = NULL;
510
+
511
+  // advance to the first char in the doctype token
512
+  if((t0 = _cmXmlAdvanceToNextNonWhite(p) ) == NULL )
513
+  {
514
+    return _cmXmlSyntaxError(p);
515
+  }
516
+
517
+  // if the end of the tag was encountered
518
+  if( *p->c == '>' )
519
+      return kOkXmlRC;
520
+    
521
+
522
+  // if the token begins with a quote
523
+  if( *p->c == '\'' )
524
+  {
525
+    if((t1 = _cmXmlAdvanceToNext(p,"'")) == NULL )
526
+      return _cmXmlSyntaxError(p);
527
+  }
528
+  else
529
+  {
530
+    if((t1 = _cmXmlAdvanceToNextWhiteOr(p,'>',' ')) == NULL )
531
+      return _cmXmlSyntaxError(p);
532
+  }
533
+
534
+  // t1 and p->c now point just past the last character in the token
535
+
536
+  return rc;  
537
+}
538
+
539
+cmXmlRC_t _cmXmlParseDoctype( cmXml_t* p )
540
+{
541
+  cmXmlRC_t rc = kOkXmlRC;
542
+  cmXmlNode_t* np;
543
+  
544
+  if((np = _cmXmlNodeAlloc(p,kDoctypeXmlFl,"DOCTYPE",strlen("DOCTYPE"))) == NULL )
545
+    return cmErrLastRC(&p->err);
546
+  
547
+  while( *p->c != '>' )
548
+    if((rc = _cmXmlParseDoctypeToken(p,np)) != kOkXmlRC )
549
+      break;
550
+
551
+  return rc;
552
+}
553
+
554
+// Node tags are tags that begin with a '<' and are not
555
+// followed by any special character.
556
+cmXmlRC_t _cmXmlParseNodeTag( cmXml_t* p )
557
+{
558
+  cmXmlRC_t       rc = kOkXmlRC;
559
+  const cmChar_t* l0 = NULL;
560
+  const cmChar_t* l1 = NULL;
561
+
562
+  // Advance to the first character of the tag label.
563
+  if((l0 = _cmXmlAdvanceToNextNonWhite(p)) == NULL )
564
+    return _cmXmlSyntaxError(p);
565
+
566
+  // Advance to the last character following the tag label.
567
+  if((l1 = _cmXmlAdvanceToNextWhiteOr(p,'/','>')) == NULL )
568
+    return _cmXmlSyntaxError(p);
569
+
570
+  // look for attributes
571
+  if((rc = _cmXmlParseAttrList(p,'/')) != kOkXmlRC )
572
+      return _cmXmlSyntaxError(p);
573
+
574
+  // p->c is now past the ending '>'
575
+  
576
+  return rc;
577
+}
578
+
579
+cmXmlRC_t _cmXmlReadEndTag( cmXml_t* p )
580
+{
581
+  const cmChar_t* l0 = NULL;
582
+  const cmChar_t* l1 = NULL;
583
+
584
+  assert( *p->c == '/' );
585
+
586
+  // advance past the '/'
587
+  if(( l0 = _cmXmlAdvanceOne(p)) == NULL )
588
+    return _cmXmlSyntaxError(p);
589
+
590
+  // advance to the ending '>'
591
+  if(( l1 = _cmXmlAdvanceToNext(p,">")) == NULL )
592
+    return _cmXmlSyntaxError(p);
593
+
594
+  // advance past the 
595
+  if( _cmXmlAdvanceOne(p) == NULL )
596
+    return _cmXmlSyntaxError(p);
597
+
598
+  // trim trailing space on label
599
+  l1 -= 1;
600
+  while( l1>l0 && isspace(*l1) )
601
+    --l1;
602
+
603
+  // verify that the label has a length
604
+  if( l0 == l1 )
605
+    return _cmXmlSyntaxError(p);
606
+
607
+  assert( !isspace(*l1) );
608
+
609
+  // the label should match the node on the top of the stack
610
+  if( strncmp( p->stack->label, l0, (l1-l0)+1 ) )
611
+    return _cmXmlSyntaxError(p);
612
+
613
+  // since we just parsed an end-tag there should be at least one node on the stack
614
+  if( p->stack == NULL )
615
+    return _cmXmlSyntaxError(p);
616
+
617
+  // pop the stack
618
+  p->stack = p->stack->parent;
619
+  
620
+  return kOkXmlRC;  
621
+}
622
+  
623
+
624
+
625
+// 
626
+cmXmlRC_t  _cmXmlReadTag( cmXml_t* p, cmXmlNode_t** newNodeRef )
627
+{
628
+  cmXmlRC_t rc = kOkXmlRC;
629
+
630
+  assert(newNodeRef != NULL );
631
+  *newNodeRef = NULL;
632
+  
633
+  // No leading '<' was found 
634
+  if( _cmXmlAdvancePast(p,"<") == NULL )
635
+  {
636
+    // error or EOF
637
+    return NULL;  
638
+  }
639
+
640
+  // examine the character following the opening '<'
641
+  switch( *p->c )
642
+  {
643
+    // node end tag
644
+    case '/':
645
+      return _cmXmlReadEndTag(p);
646
+    
647
+    // declaration tag
648
+    case '?':
649
+      if( _cmXmlAdvancePast(p,"xml") == NULL )
650
+        return _cmXmlSyntaxError(p);
651
+
652
+      if( _cmXmlNodeAlloc(p,kDeclXmlFl, "xml",strlen("xml") ) == NULL )
653
+        return cmErrLastRC(&p->err);
654
+      
655
+      if((rc = _cmXmlParseAttrList(p,'?')) != kOkXmlRC )
656
+        return rc;
657
+      
658
+      break;
659
+      
660
+    case '!':
661
+      switch( *(p->c+1) )
662
+      {
663
+        // comment node
664
+        case '-':
665
+          if( _cmXmlAdvancePast(p,"--") == NULL )
666
+            return _cmXmlSyntaxError(p);
667
+        
668
+          if( _cmXmlAdvanceToNext("->") == NULL )
669
+            return _cmXmlSyntaxError(p);
670
+
671
+          // p->c is just after "-->"
672
+          break;
673
+          
674
+          // DOCTYPE node
675
+        case 'D':
676
+          if( _cmXmlAdvancePast(P,"DOCTYPE")==NULL )
677
+            return _cmXmlSyntaxError(p);
678
+        
679
+          if((rc = _cmXmlParseDocType(p)) != kOkXmlRC )
680
+            return _cmXmlSyntaxError(p);
681
+
682
+          // p->c is just after ">"
683
+
684
+          break;
685
+        
686
+        default:
687
+          return _cmXmlSyntaxError(p);
688
+      }
689
+      break;
690
+      
691
+    default:
692
+      // normal node
693
+      if((rc = _cmXmlParseNodeTag(p)) != kOkXmlRC )
694
+        return rc;
695
+
696
+      // p->c is just after ">"
697
+      
698
+  }
699
+  
700
+  return rc;
701
+}
702
+
703
+cmXmlRC_t  _cmXmlReadNode( cmXml_t* p )
704
+{
705
+}

+ 21
- 4
cmXml.h View File

@@ -10,22 +10,39 @@ extern "C" {
10 10
     kOkXmlRC = cmOkRC,
11 11
     kMemAllocErrXmlRC,
12 12
     kLHeapXmlRC,
13
-    kLexErrXmlRC
13
+    kLexErrXmlRC,
14
+    kSyntaxErrorXmlRC
14 15
   }; 
15 16
   
16 17
   typedef struct cmXmlAttr_str
17 18
   {
18
-    const cmChar_t*      label;
19
-    const cmChar_t*      value;
19
+    const cmChar_t*       label;
20
+    const cmChar_t*       value;    
20 21
     struct cmXmlAttr_str* link;
21 22
   } cmXmlAttr_t;
23
+
24
+  enum
25
+  {
26
+    kDeclXmlFl    = 0x0001,
27
+    kDoctypeXmlFl = 0x0002,
28
+    kNormalXmlFl  = 0x0004,
29
+    
30
+  };
22 31
   
23 32
   typedef struct cmXmlNode_str
24 33
   {
34
+    unsigned              flags;
35
+    
36
+    const cmChar_t*       label;
37
+    const cmChar_t*       dataStr;
38
+    
39
+    cmXmlAttr_t*          attr;
40
+    
25 41
     struct cmXmlNode_str* parent;
26 42
     struct cmXmlNode_str* children;
27 43
     struct cmXmlNode_str* sibling;
28
-    cmXmlAttr_t*          attr;
44
+    
45
+    
29 46
   } cmXmlNode_t;
30 47
 
31 48
   typedef cmHandle_t cmXmlH_t;

Loading…
Cancel
Save