src/dvipdfmx-pu/src/pdfparse.c

   1 /*
   2
   3     This is dvipdfmx, an eXtended version of dvipdfm by Mark A. Wicks.
   4
   5     Copyright (C) 2007-2012 by Jin-Hwan Cho and Shunsaku Hirata,
   6     the dvipdfmx project team.
   7
   8     Copyright (C) 1998, 1999 by Mark A. Wicks <mwicks@kettering.edu>
   9
  10     This program is free software; you can redistribute it and/or modify
  11     it under the terms of the GNU General Public License as published by
  12     the Free Software Foundation; either version 2 of the License, or
  13     (at your option) any later version.
  14
  15     This program is distributed in the hope that it will be useful,
  16     but WITHOUT ANY WARRANTY; without even the implied warranty of
  17     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18     GNU General Public License for more details.
  19
  20     You should have received a copy of the GNU General Public License
  21     along with this program; if not, write to the Free Software
  22     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
  23 */
  24
  25 #if HAVE_CONFIG_H
  26 #include "config.h"
  27 #endif
  28
  29 #include <ctype.h>
  30 #include <string.h>
  31
  32 #include "system.h"
  33 #include "mem.h"
  34 #include "error.h"
  35
  36 #include "numbers.h"
  37
  38 #include "mfileio.h"
  39
  40 #include "pdfobj.h"
  41 #include "pdfdoc.h"
  42 #include "pdfdev.h"
  43
  44 #include "pdfparse.h"
  45
  46 /* PDF */
  47 #ifdef  is_space
  48 #undef  is_space
  49 #endif
  50 #ifdef  is_delim
  51 #undef  is_delim
  52 #endif
  53
  54 #define is_space(c) ((c) == ' '  || (c) == '\t' || (c) == '\f' || \
  55                      (c) == '\r' || (c) == '\n' || (c) == '\0')
  56 #define is_delim(c) ((c) == '(' || (c) == '/' || \
  57                      (c) == '<' || (c) == '>' || \
  58                      (c) == '[' || (c) == ']' || \
  59                      (c) == '%')
  60 #define PDF_TOKEN_END(p,e) ((p) >= (e) || is_space(*(p)) || is_delim(*(p)))
  61
  62 #define istokensep(c) (is_space((c)) || is_delim((c)))
  63
  64 static struct {
  65   int tainted;
  66 } parser_state = {
  67   0
  68 };
  69
  70 static int xtoi (char ch);
  71
  72 static const char *save = NULL;
  73
  74 void
  75 dump (const char *start, const char *end)
  76 {
  77   const char *p = start;
  78
  79 #define DUMP_LIMIT 50
  80   MESG("\nCurrent input buffer is -->");
  81   while (p < end && p < start + DUMP_LIMIT)
  82     MESG("%c", *(p++));
  83   if (p == start+DUMP_LIMIT)
  84     MESG("...");
  85   MESG("<--\n");
  86 }
  87
  88 #define SAVE(s,e) do {\
  89    save = (s);\
  90  } while (0)
  91 #define DUMP_RESTORE(s,e) do {\
  92    dump(save, end);\
  93    (s) = save;\
  94  } while (0)
  95
  96 void
  97 skip_line (const char **start, const char *end)
  98 {
  99   while (*start < end && **start != '\n' && **start != '\r')
 100     (*start)++;
 101   /* The carriage return (CR; \r; 0x0D) and line feed (LF; \n; 0x0A)
 102    * characters, also called newline characters, are treated as
 103    * end-of-line (EOL) markers. The combination of a carriage return
 104    * followed immediately by a line feed is treated as one EOL marker.
 105    */
 106   if (*start < end && **start == '\r')
 107     (*start)++;
 108   if (*start < end && **start == '\n')
 109     (*start)++;
 110 }
 111
 112 void
 113 skip_white (const char **start, const char *end)
 114 {
 115   /*
 116    * The null (NUL; 0x00) character is a white-space character in PDF spec
 117    * but isspace(0x00) returns FALSE; on the other hand, the vertical tab
 118    * (VT; 0x0B) character is not a white-space character in PDF spec but
 119    * isspace(0x0B) returns TRUE.
 120    */
 121   while (*start < end && (is_space(**start) || **start == '%')) {
 122     if (**start == '%')
 123       skip_line(start, end);
 124     else
 125       (*start)++;
 126   }
 127 }
 128
 129
 130 static char *
 131 parsed_string (const char *start, const char *end)
 132 {
 133   char *result = NULL;
 134   int   len;
 135
 136   len = end - start;
 137   if (len > 0) {
 138     result = NEW(len + 1, char);
 139     memcpy(result, start, len);
 140     result[len] = '\0';
 141   }
 142
 143   return result;
 144 }
 145
 146 char *
 147 parse_number (const char **start, const char *end)
 148 {
 149   char *number;
 150   const char *p;
 151
 152   skip_white(start, end);
 153   p = *start;
 154   if (p < end && (*p == '+' || *p == '-'))
 155     p++;
 156   while (p < end && isdigit(*p))
 157     p++;
 158   if (p < end && *p == '.') {
 159     p++;
 160     while (p < end && isdigit(*p))
 161       p++;
 162   }
 163   number = parsed_string(*start, p);
 164
 165   *start = p;
 166   return number;
 167 }
 168
 169 char *
 170 parse_unsigned (const char **start, const char *end)
 171 {
 172   char *number;
 173   const char *p;
 174
 175   skip_white(start, end);
 176   for (p = *start; p < end; p++) {
 177     if (!isdigit(*p))
 178       break;
 179   }
 180   number = parsed_string(*start, p);
 181
 182   *start = p;
 183   return number;
 184 }
 185
 186 static char *
 187 parse_gen_ident (const char **start, const char *end, const char *valid_chars)
 188 {
 189   char *ident;
 190   const char *p;
 191
 192   /* No skip_white(start, end)? */
 193   for (p = *start; p < end; p++) {
 194     if (!strchr(valid_chars, *p))
 195       break;
 196   }
 197   ident = parsed_string(*start, p);
 198
 199   *start = p;
 200   return ident;
 201 }
 202
 203 char *
 204 parse_ident (const char **start, const char *end)
 205 {
 206   static const char *valid_chars =
 207     "!\"#$&'*+,-.0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\\^_`abcdefghijklmnopqrstuvwxyz|~";
 208
 209   return parse_gen_ident(start, end, valid_chars);
 210 }
 211
 212 char *
 213 parse_val_ident (const char **start, const char *end)
 214 {
 215   static const char *valid_chars =
 216     "!\"#$&'*+,-./0123456789:;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\\^_`abcdefghijklmnopqrstuvwxyz|~";
 217
 218   return parse_gen_ident(start, end, valid_chars);
 219 }
 220
 221 char *
 222 parse_opt_ident (const char **start, const char *end)
 223 {
 224   if (*start < end && **start == '@') {
 225     (*start)++;
 226     return parse_ident(start, end);
 227   }
 228
 229   return NULL;
 230 }
 231
 232 #define DDIGITS_MAX 10
 233 pdf_obj *
 234 parse_pdf_number (const char **pp, const char *endptr)
 235 {
 236   const char *p;
 237   unsigned long ipart = 0, dpart = 0;
 238   int      nddigits = 0, sign = 1;
 239   int      has_dot = 0;
 240   static double ipot[DDIGITS_MAX+1] = {
 241     1.0,
 242     0.1,
 243     0.01,
 244     0.001,
 245     0.0001,
 246     0.00001,
 247     0.000001,
 248     0.0000001,
 249     0.00000001,
 250     0.000000001,
 251     0.0000000001
 252   };
 253
 254   p = *pp;
 255   skip_white(&p, endptr);
 256   if (p >= endptr ||
 257       (!isdigit(p[0]) && p[0] != '.' &&
 258        p[0] != '+' && p[0] != '-')) {
 259     WARN("Could not find a numeric object.");
 260     return NULL;
 261   }
 262
 263   if (p[0] == '-') {
 264     if (p + 1 >= endptr) {
 265       WARN("Could not find a numeric object.");
 266       return NULL;
 267     }
 268     sign = -1;
 269     p++;
 270   } else if (p[0] == '+') {
 271     if (p + 1 >= endptr) {
 272       WARN("Could not find a numeric object.");
 273       return NULL;
 274     }
 275     sign =  1;
 276     p++;
 277   }
 278
 279   while (p < endptr && !istokensep(p[0])) {
 280     if (p[0] == '.') {
 281       if (has_dot) { /* Two dots */
 282         WARN("Could not find a numeric object.");
 283         return NULL;
 284       } else {
 285         has_dot = 1;
 286       }
 287     } else if (isdigit(p[0])) {
 288       if (has_dot) {
 289         if (nddigits == DDIGITS_MAX && pdf_obj_get_verbose() > 1) {
 290           WARN("Number with more than %d fractional digits.", DDIGITS_MAX);
 291         } else if (nddigits < DDIGITS_MAX) {
 292           dpart = dpart * 10 + p[0] - '0';
 293           nddigits++;
 294         } /* Ignore decimal digits more than DDIGITS_MAX */
 295       } else {
 296         ipart = ipart * 10 + p[0] - '0';
 297       }
 298     } else {
 299       WARN("Could not find a numeric object.");
 300       return NULL;
 301     }
 302     p++;
 303   }
 304
 305   *pp = p;
 306   return pdf_new_number((double) sign * (((double ) ipart) + dpart * ipot[nddigits]));
 307 }
 308
 309 /*
 310  * PDF Name:
 311  *
 312  *  PDF-1.2+: Two hexadecimal digits preceded by a number sign.
 313  */
 314 static int
 315 pn_getc (const char **pp, const char *endptr)
 316 {
 317   int   ch = 0;
 318   const char *p;
 319
 320   p  = *pp;
 321   if (p[0] == '#') {
 322     if (p + 2 >= endptr) {
 323       *pp = endptr;
 324       return -1;
 325     }
 326     if (!isxdigit(p[1]) || !isxdigit(p[2])) {
 327       *pp += 3;
 328       return -1;
 329     }
 330     ch   = (xtoi(p[1]) << 4);
 331     ch  += xtoi(p[2]);
 332     *pp += 3;
 333   } else {
 334     ch = p[0];
 335     *pp += 1;
 336   }
 337
 338   return ch;
 339 }
 340
 341 #ifndef PDF_NAME_LEN_MAX
 342 #define PDF_NAME_LEN_MAX 128
 343 #endif
 344
 345 #ifndef PDF_STRING_LEN_MAX
 346 #define PDF_STRING_LEN_MAX 65535
 347 #endif
 348
 349 #define STRING_BUFFER_SIZE PDF_STRING_LEN_MAX+1
 350 static char sbuf[PDF_STRING_LEN_MAX+1];
 351
 352
 353 pdf_obj *
 354 parse_pdf_name (const char **pp, const char *endptr)
 355 {
 356   char  name[PDF_NAME_LEN_MAX+1];
 357   int   ch, len = 0;
 358
 359   skip_white(pp, endptr);
 360   if (*pp >= endptr || **pp != '/') {
 361     WARN("Could not find a name object.");
 362     return NULL;
 363   }
 364
 365   (*pp)++;
 366   while (*pp < endptr && !istokensep(**pp)) {
 367     ch = pn_getc(pp, endptr);
 368     if (ch < 0 || ch > 0xff) {
 369       WARN("Invalid char in PDF name object. (ignored)");
 370     } else if (ch == 0) {
 371       WARN("Null char not allowed in PDF name object. (ignored)");
 372     } else if (len < STRING_BUFFER_SIZE) {
 373       if (len == PDF_NAME_LEN_MAX) {
 374         WARN("PDF name length too long. (>= %d bytes)", PDF_NAME_LEN_MAX);
 375       }
 376       name[len++] = ch;
 377     } else {
 378       WARN("PDF name length too long. (>= %d bytes, truncated)",
 379            STRING_BUFFER_SIZE);
 380     }
 381   }
 382   if (len < 1) {
 383     WARN("No valid name object found.");
 384     return NULL;
 385   }
 386   name[len] = '\0';
 387
 388   return pdf_new_name(name);
 389 }
 390
 391 pdf_obj *
 392 parse_pdf_boolean (const char **pp, const char *endptr)
 393 {
 394   skip_white(pp, endptr);
 395   if (*pp + 4 <= endptr &&
 396       !strncmp(*pp, "true", 4)) {
 397     if (*pp + 4 == endptr ||
 398         istokensep(*(*pp + 4))) {
 399       *pp += 4;
 400       return pdf_new_boolean(1);
 401     }
 402   } else if (*pp + 5 <= endptr &&
 403              !strncmp(*pp, "false", 5)) {
 404     if (*pp + 5 == endptr ||
 405         istokensep(*(*pp + 5))) {
 406       *pp += 5;
 407       return pdf_new_boolean(0);
 408     }
 409   }
 410
 411   WARN("Not a boolean object.");
 412
 413   return NULL;
 414 }
 415
 416 pdf_obj *
 417 parse_pdf_null (const char **pp, const char *endptr)
 418 {
 419   skip_white(pp, endptr);
 420   if (*pp + 4 > endptr) {
 421     WARN("Not a null object.");
 422     return NULL;
 423   } else if (*pp + 4 < endptr &&
 424              !istokensep(*(*pp+4))) {
 425     WARN("Not a null object.");
 426     return NULL;
 427   } else if (!strncmp(*pp, "null", 4)) {
 428     *pp += 4;
 429     return pdf_new_null();
 430   }
 431
 432   WARN("Not a null object.");
 433
 434   return NULL;
 435 }
 436
 437 /*
 438  * PDF Literal String
 439  */
 440 #ifndef isodigit
 441 #define isodigit(c) ((c) >= '0' && (c) <= '7')
 442 #endif
 443 static int
 444 ps_getescc (const char **pp, const char *endptr)
 445 {
 446   int   ch, i;
 447   const char *p;
 448
 449   p = *pp + 1; /* backslash assumed. */
 450   switch (p[0]) {
 451   case 'n': ch = '\n'; p++; break;
 452   case 'r': ch = '\r'; p++; break;
 453   case 't': ch = '\t'; p++; break;
 454   case 'b': ch = '\b'; p++; break;
 455   case 'f': ch = '\f'; p++; break;
 456
 457     /*
 458      * An end-of-line marker preceded by a backslash must be ignored.
 459      */
 460   case '\n':
 461     ch = -1;
 462     p++;
 463     break;
 464   case '\r':
 465     ch = -1;
 466     p++;
 467     if (p < endptr && p[0] == '\n')
 468       p++;
 469     break;
 470   default:
 471     if (p[0] == '\\' ||
 472         p[0] == '('  || p[0] == ')') {
 473       ch = p[0];
 474       p++;
 475     } else if (isodigit(p[0])) {
 476       ch = 0;
 477       /* Don't forget isodigit() is a macro. */
 478       for (i = 0; i < 3 &&
 479              p < endptr && isodigit(p[0]); i++) {
 480         ch = (ch << 3) + (p[0] - '0');
 481         p++;
 482       }
 483       ch = (ch & 0xff); /* Ignore overflow. */
 484     } else {
 485       ch = ((unsigned char) p[0]); /* Ignore only backslash. */
 486       p++;
 487     }
 488   }
 489
 490   *pp = p;
 491   return ch;
 492 }
 493
 494 static pdf_obj *
 495 parse_pdf_literal_string (const char **pp, const char *endptr)
 496 {
 497   int    ch, op_count = 0, len = 0;
 498   const char  *p;
 499
 500   p = *pp;
 501
 502   skip_white(&p, endptr);
 503
 504   if (p >= endptr || p[0] != '(')
 505     return NULL;
 506
 507   p++;
 508
 509   /* The carriage return (CR, 0x0d) and line feed (LF, 0x0a) characters,
 510    * also called newline characters, are treated as end-of-line (EOL)
 511    * markers. The combination of a carriage return followed immediately
 512    * by a line feed is treated as one EOL marker.
 513    * [PDF Reference, 6th ed., version 1.7, p. 50] */
 514
 515   /* If an end-of-line marker appears within a literal string
 516    * without a preceding backslash, the result is equivalent to
 517    * \n (regardless of whether the end-of-line marker was
 518    * a carriage return, a line feed, or both).
 519    * [PDF Reference, 6th ed., version 1.7, p. 55] */
 520
 521   while (p < endptr) {
 522
 523     ch = p[0];
 524
 525     if (ch == ')' && op_count < 1)
 526       break;
 527
 528 #ifndef PDF_PARSE_STRICT
 529     if (parser_state.tainted) {
 530       if (p + 1 < endptr && (ch & 0x80)) {
 531         if (len + 2 >= PDF_STRING_LEN_MAX) {
 532           WARN("PDF string length too long. (limit: %ld)",
 533                PDF_STRING_LEN_MAX);
 534           return NULL;
 535         }
 536         sbuf[len++] = p[0];
 537         sbuf[len++] = p[1];
 538         p += 2;
 539         continue;
 540       }
 541     }
 542 #endif /* !PDF_PARSE_STRICT */
 543
 544     if (len + 1 >= PDF_STRING_LEN_MAX) {
 545       WARN("PDF string length too long. (limit: %ld)",
 546            PDF_STRING_LEN_MAX);
 547       return NULL;
 548     }
 549
 550     switch (ch) {
 551     case '\\':
 552       ch = ps_getescc(&p, endptr);
 553       if (ch >= 0)
 554         sbuf[len++] = (ch & 0xff);
 555       break;
 556     case '\r':
 557       p++;
 558       if (p < endptr && p[0] == '\n')
 559         p++;
 560       sbuf[len++] = '\n';
 561       break;
 562     default:
 563       if (ch == '(')
 564         op_count++;
 565       else if (ch == ')')
 566         op_count--;
 567       sbuf[len++] = ch;
 568       p++;
 569       break;
 570     }
 571   }
 572
 573   if (op_count > 0 ||
 574       p >= endptr  || p[0] != ')') {
 575     WARN("Unbalanced parens/truncated PDF literal string.");
 576     return NULL;
 577   }
 578
 579   *pp = p + 1;
 580   return pdf_new_string(sbuf, len);
 581 }
 582
 583 /*
 584  * PDF Hex String
 585  */
 586 static int
 587 xtoi (char ch)
 588 {
 589   if (ch >= '0' && ch <= '9')
 590     return ch - '0';
 591   if (ch >= 'A' && ch <= 'F')
 592     return (ch - 'A') + 10;
 593   if (ch >= 'a' && ch <= 'f')
 594     return (ch - 'a') + 10;
 595
 596   return -1;
 597 }
 598
 599 static pdf_obj *
 600 parse_pdf_hex_string (const char **pp, const char *endptr)
 601 {
 602   const char  *p;
 603   long   len;
 604
 605   p = *pp;
 606
 607   skip_white(&p, endptr);
 608   if (p >= endptr || p[0] != '<')
 609     return NULL;
 610
 611   p++;
 612
 613   len = 0;
 614   /*
 615    * PDF Reference does not describe how to treat invalid char.
 616    * Zero is appended if final hex digit is missing.
 617    */
 618   while (p < endptr && p[0] != '>' && len < PDF_STRING_LEN_MAX) {
 619     int  ch;
 620
 621     skip_white(&p, endptr);
 622     if (p >= endptr || p[0] == '>')
 623       break;
 624
 625     ch = (xtoi(p[0]) << 4);
 626     p++;
 627
 628     skip_white(&p, endptr);
 629     if (p < endptr && p[0] != '>') {
 630       ch += xtoi(p[0]);
 631       p++;
 632     }
 633     sbuf[len++] = (ch & 0xff);
 634   }
 635
 636   if (p >= endptr) {
 637     WARN("Premature end of input hex string.");
 638     return NULL;
 639   } else if (p[0] != '>') {
 640     WARN("PDF string length too long. (limit: %ld)", PDF_STRING_LEN_MAX);
 641     return NULL;
 642   }
 643
 644   *pp = p + 1;
 645   return pdf_new_string(sbuf, len);
 646 }
 647
 648 pdf_obj *
 649 parse_pdf_string (const char **pp, const char *endptr)
 650 {
 651   skip_white(pp, endptr);
 652   if (*pp + 2 <= endptr) {
 653     if (**pp == '(')
 654       return parse_pdf_literal_string(pp, endptr);
 655     else if (**pp == '<' &&
 656              (*(*pp + 1) == '>' || isxdigit(*(*pp + 1)))) {
 657       return parse_pdf_hex_string(pp, endptr);
 658     }
 659   }
 660
 661   WARN("Could not find a string object.");
 662
 663   return NULL;
 664 }
 665
 666 #ifndef PDF_PARSE_STRICT
 667 pdf_obj *
 668 parse_pdf_tainted_dict (const char **pp, const char *endptr)
 669 {
 670   pdf_obj *result;
 671
 672   parser_state.tainted = 1;
 673   result  = parse_pdf_dict(pp, endptr, NULL);
 674   parser_state.tainted = 0;
 675
 676   return result;
 677 }
 678 #else /* PDF_PARSE_STRICT */
 679 pdf_obj *
 680 parse_pdf_tainted_dict (const char **pp, const char *endptr)
 681 {
 682   return parse_pdf_dict(pp, endptr, NULL);
 683 }
 684 #endif /* !PDF_PARSE_STRICT */
 685
 686 pdf_obj *
 687 parse_pdf_dict (const char **pp, const char *endptr, pdf_file *pf)
 688 {
 689   pdf_obj *result = NULL;
 690   const char *p;
 691
 692   p = *pp;
 693
 694   skip_white(&p, endptr);
 695
 696   /* At least four letter <<>>. */
 697   if (p + 4 > endptr ||
 698       p[0] != '<'    || p[1] != '<') {
 699     return NULL;
 700   }
 701   p += 2;
 702
 703   result = pdf_new_dict();
 704
 705   skip_white(&p, endptr);
 706   while (p < endptr && p[0] != '>') {
 707     pdf_obj *key, *value;
 708
 709     skip_white(&p, endptr);
 710     key = parse_pdf_name(&p, endptr);
 711     if (!key) {
 712       WARN("Could not find a key in dictionary object.");
 713       pdf_release_obj(result);
 714       return NULL;
 715     }
 716
 717     skip_white(&p, endptr);
 718
 719     value = parse_pdf_object(&p, endptr, pf);
 720     if (!value) {
 721       pdf_release_obj(key);
 722       pdf_release_obj(value);
 723       pdf_release_obj(result);
 724       WARN("Could not find a value in dictionary object.");
 725       return NULL;
 726     }
 727     pdf_add_dict(result, key, value);
 728
 729     skip_white(&p, endptr);
 730   }
 731
 732   if (p + 2 > endptr ||
 733       p[0] != '>'    || p[1] != '>') {
 734     WARN("Syntax error: Dictionary object ended prematurely.");
 735     pdf_release_obj(result);
 736     return NULL;
 737   }
 738
 739   *pp = p + 2; /* skip >> */
 740   return result;
 741 }
 742
 743 pdf_obj *
 744 parse_pdf_array (const char **pp, const char *endptr, pdf_file *pf)
 745 {
 746   pdf_obj *result;
 747   const char *p;
 748
 749   p = *pp;
 750
 751   skip_white(&p, endptr);
 752   if (p + 2 > endptr || p[0] != '[') {
 753     WARN("Could not find an array object.");
 754     return NULL;
 755   }
 756
 757   result = pdf_new_array();
 758
 759   p++;
 760   skip_white(&p, endptr);
 761
 762   while (p < endptr && p[0] != ']') {
 763     pdf_obj *elem;
 764
 765     elem = parse_pdf_object(&p, endptr, pf);
 766     if (!elem) {
 767       pdf_release_obj(result);
 768       WARN("Could not find a valid object in array object.");
 769       return NULL;
 770     }
 771     pdf_add_array(result, elem);
 772
 773     skip_white(&p, endptr);
 774   }
 775
 776   if (p >= endptr || p[0] != ']') {
 777     WARN("Array object ended prematurely.");
 778     pdf_release_obj(result);
 779     return NULL;
 780   }
 781
 782   *pp = p + 1; /* skip ] */
 783   return result;
 784 }
 785
 786 static pdf_obj *
 787 parse_pdf_stream (const char **pp, const char *endptr, pdf_obj *dict, pdf_file *pf)
 788 {
 789   pdf_obj *result = NULL;
 790   const char *p;
 791   pdf_obj *stream_dict;
 792   long     stream_length;
 793
 794   p = *pp;
 795   skip_white(&p, endptr);
 796   if (p + 6 > endptr ||
 797       strncmp(p, "stream", 6)) {
 798     return NULL;
 799   }
 800   p += 6;
 801
 802   /* The keyword stream that follows the stream dictionary
 803    * should be followed by an end-of-line marker consisting of
 804    * either a carriage return (0x0D;\r) and a line feed (0x0A;\n)
 805    * or just a line feed, and not by a carriage return alone.
 806    * [PDF Reference, 6th ed., version 1.7, pp. 60-61] */
 807
 808   /* Notice that TeX translates an end-of-line marker to a single space. */
 809   if (p < endptr && p[0] == '\n') {
 810     p++;
 811   } else if (p + 1 < endptr &&
 812              (p[0] == '\r' && p[1] == '\n')) {
 813     p += 2;
 814   }
 815
 816   /* Stream length */
 817   {
 818     pdf_obj *tmp, *tmp2;
 819
 820     tmp = pdf_lookup_dict(dict, "Length");
 821
 822     if (tmp != NULL) {
 823       tmp2 = pdf_deref_obj(tmp);
 824       if (pdf_obj_typeof(tmp2) != PDF_NUMBER)
 825         stream_length = -1;
 826       else {
 827         stream_length = (long) pdf_number_value(tmp2);
 828       }
 829       pdf_release_obj(tmp2);
 830     }
 831     else {
 832       return NULL;
 833     }
 834   }
 835
 836
 837   if (stream_length < 0 ||
 838       p + stream_length > endptr)
 839     return NULL;
 840
 841   /*
 842    * If Filter is not applied, set STREAM_COMPRESS flag.
 843    * Should we use filter for ASCIIHexEncode/ASCII85Encode-ed streams?
 844    */
 845   {
 846     pdf_obj *filters;
 847
 848     filters = pdf_lookup_dict(dict, "Filter");
 849     if (!filters && stream_length > 10) {
 850       result = pdf_new_stream(STREAM_COMPRESS);
 851     } else {
 852       result = pdf_new_stream(0);
 853     }
 854   }
 855
 856   stream_dict = pdf_stream_dict(result);
 857   pdf_merge_dict(stream_dict, dict);
 858
 859   pdf_add_stream(result, p, stream_length);
 860   p += stream_length;
 861
 862   /* Check "endsteam" */
 863   {
 864     /* It is recommended that there be an end-of-line marker
 865      * after the data and before endstream; this marker is not included
 866      * in the stream length.
 867      * [PDF Reference, 6th ed., version 1.7, pp. 61] */
 868     if (p < endptr && p[0] == '\r')
 869       p++;
 870     if (p < endptr && p[0] == '\n')
 871       p++;
 872
 873     if (p + 9 > endptr ||
 874         memcmp(p, "endstream", 9)) {
 875       pdf_release_obj(result);
 876       return NULL;
 877     }
 878     p += 9;
 879   }
 880
 881   *pp = p;
 882   return  result;
 883 }
 884
 885 #ifndef PDF_PARSE_STRICT
 886
 887 /* PLEASE REMOVE THIS */
 888 #include "specials.h"
 889
 890 /* This is not PDF indirect reference. */
 891 static pdf_obj *
 892 parse_pdf_reference (const char **start, const char *end)
 893 {
 894   pdf_obj *result = NULL;
 895   char    *name;
 896
 897   SAVE(*start, end);
 898
 899   skip_white(start, end);
 900   name = parse_opt_ident(start, end);
 901   if (name) {
 902     result = spc_lookup_reference(name);
 903     if (!result) {
 904       WARN("Could not find the named reference (@%s).", name);
 905       DUMP_RESTORE(*start, end);
 906     }
 907     RELEASE(name);
 908   } else {
 909     WARN("Could not find a reference name.");
 910     DUMP_RESTORE(*start, end);
 911     result = NULL;
 912   }
 913
 914   return result;
 915 }
 916 #endif /* !PDF_PARSE_STRICT */
 917
 918 static pdf_obj *
 919 try_pdf_reference (const char *start, const char *end, const char **endptr, pdf_file *pf)
 920 {
 921   unsigned long id = 0;
 922   unsigned short gen = 0;
 923
 924   ASSERT(pf);
 925
 926   if (endptr)
 927     *endptr = start;
 928
 929   skip_white(&start, end);
 930   if (start > end - 5 || !isdigit(*start)) {
 931     return NULL;
 932   }
 933   while (!is_space(*start)) {
 934     if (start >= end || !isdigit(*start)) {
 935       return NULL;
 936     }
 937     id = id * 10 + (*start - '0');
 938     start++;
 939   }
 940
 941   skip_white(&start, end);
 942   if (start >= end || !isdigit(*start))
 943     return NULL;
 944   while (!is_space(*start)) {
 945     if (start >= end || !isdigit(*start))
 946       return NULL;
 947     gen = gen * 10 + (*start - '0');
 948     start++;
 949   }
 950
 951   skip_white(&start, end);
 952   if (start >= end  || *start != 'R')
 953     return NULL;
 954   start++;
 955   if (!PDF_TOKEN_END(start, end))
 956     return NULL;
 957
 958   if (endptr)
 959     *endptr = start;
 960
 961   return pdf_new_indirect(pf, id, gen);
 962 }
 963
 964 pdf_obj *
 965 parse_pdf_object (const char **pp, const char *endptr, pdf_file *pf)
 966 /* If pf is NULL, then indirect references are not allowed */
 967 {
 968   pdf_obj *result = NULL;
 969   const char *nextptr;
 970
 971   skip_white(pp, endptr);
 972   if (*pp >= endptr) {
 973     WARN("Could not find any valid object.");
 974     return NULL;
 975   }
 976
 977   switch (**pp) {
 978
 979   case '<':
 980
 981     if (*(*pp + 1) != '<') {
 982       result = parse_pdf_hex_string(pp, endptr);
 983     } else {
 984       pdf_obj *dict;
 985
 986       result = parse_pdf_dict(pp, endptr, pf);
 987       skip_white(pp, endptr);
 988       if ( result &&
 989           *pp <= endptr - 15 &&
 990           !memcmp(*pp, "stream", 6)) {
 991         dict   = result;
 992         result = parse_pdf_stream(pp, endptr, dict, pf);
 993         pdf_release_obj(dict);
 994       }
 995     }
 996
 997     break;
 998   case '(':
 999     result = parse_pdf_string(pp, endptr);
1000     break;
1001   case '[':
1002     result = parse_pdf_array(pp, endptr, pf);
1003     break;
1004   case '/':
1005     result = parse_pdf_name(pp, endptr);
1006     break;
1007   case 'n':
1008     result = parse_pdf_null(pp, endptr);
1009     break;
1010   case 't': case 'f':
1011     result = parse_pdf_boolean(pp, endptr);
1012     break;
1013   case '+': case '-': case '.':
1014     result = parse_pdf_number(pp, endptr);
1015     break;
1016   case '0': case '1': case '2': case '3': case '4':
1017   case '5': case '6': case '7': case '8': case '9':
1018
1019     /*
1020      * If pf != NULL, then we are parsing a PDF file,
1021      * and indirect references are allowed.
1022      */
1023     if (pf && (result = try_pdf_reference(*pp, endptr, &nextptr, pf))) {
1024       *pp = nextptr;
1025     } else {
1026       result = parse_pdf_number(pp, endptr);
1027     }
1028     break;
1029
1030   case '@':
1031
1032 #ifndef PDF_PARSE_STRICT
1033     result = parse_pdf_reference(pp, endptr);
1034 #endif /* !PDF_PARSE_STRICT */
1035     break;
1036
1037   default:
1038     WARN("Unknown PDF object type.");
1039     result = NULL;
1040   }
1041
1042   return result;
1043 }
1044