3 This is dvipdfmx, an eXtended version of dvipdfm by Mark A. Wicks.
5 Copyright (C) 2007-2012 by Jin-Hwan Cho and Shunsaku Hirata,
6 the dvipdfmx project team.
8 Copyright (C) 1998, 1999 by Mark A. Wicks <mwicks@kettering.edu>
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 2 of the License, or
13 (at your option) any later version.
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
54 #define is_space(c) ((c) == ' ' || (c) == '\t' || (c) == '\f' || \
55 (c) == '\r' || (c) == '\n' || (c) == '\0')
56 #define is_delim(c) ((c) == '(' || (c) == '/' || \
57 (c) == '<' || (c) == '>' || \
58 (c) == '[' || (c) == ']' || \
60 #define PDF_TOKEN_END(p,e) ((p) >= (e) || is_space(*(p)) || is_delim(*(p)))
62 #define istokensep(c) (is_space((c)) || is_delim((c)))
70 static int xtoi (char ch);
72 static const char *save = NULL;
75 dump (const char *start, const char *end)
77 const char *p = start;
80 MESG("\nCurrent input buffer is -->");
81 while (p < end && p < start + DUMP_LIMIT)
83 if (p == start+DUMP_LIMIT)
88 #define SAVE(s,e) do {\
91 #define DUMP_RESTORE(s,e) do {\
97 skip_line (const char **start, const char *end)
99 while (*start < end && **start != '\n' && **start != '\r')
101 /* The carriage return (CR; \r; 0x0D) and line feed (LF; \n; 0x0A)
102 * characters, also called newline characters, are treated as
103 * end-of-line (EOL) markers. The combination of a carriage return
104 * followed immediately by a line feed is treated as one EOL marker.
106 if (*start < end && **start == '\r')
108 if (*start < end && **start == '\n')
113 skip_white (const char **start, const char *end)
116 * The null (NUL; 0x00) character is a white-space character in PDF spec
117 * but isspace(0x00) returns FALSE; on the other hand, the vertical tab
118 * (VT; 0x0B) character is not a white-space character in PDF spec but
119 * isspace(0x0B) returns TRUE.
121 while (*start < end && (is_space(**start) || **start == '%')) {
123 skip_line(start, end);
131 parsed_string (const char *start, const char *end)
138 result = NEW(len + 1, char);
139 memcpy(result, start, len);
147 parse_number (const char **start, const char *end)
152 skip_white(start, end);
154 if (p < end && (*p == '+' || *p == '-'))
156 while (p < end && isdigit(*p))
158 if (p < end && *p == '.') {
160 while (p < end && isdigit(*p))
163 number = parsed_string(*start, p);
170 parse_unsigned (const char **start, const char *end)
175 skip_white(start, end);
176 for (p = *start; p < end; p++) {
180 number = parsed_string(*start, p);
187 parse_gen_ident (const char **start, const char *end, const char *valid_chars)
192 /* No skip_white(start, end)? */
193 for (p = *start; p < end; p++) {
194 if (!strchr(valid_chars, *p))
197 ident = parsed_string(*start, p);
204 parse_ident (const char **start, const char *end)
206 static const char *valid_chars =
207 "!\"#$&'*+,-.0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\\^_`abcdefghijklmnopqrstuvwxyz|~";
209 return parse_gen_ident(start, end, valid_chars);
213 parse_val_ident (const char **start, const char *end)
215 static const char *valid_chars =
216 "!\"#$&'*+,-./0123456789:;?@ABCDEFGHIJKLMNOPQRSTUVWXYZ\\^_`abcdefghijklmnopqrstuvwxyz|~";
218 return parse_gen_ident(start, end, valid_chars);
222 parse_opt_ident (const char **start, const char *end)
224 if (*start < end && **start == '@') {
226 return parse_ident(start, end);
232 #define DDIGITS_MAX 10
234 parse_pdf_number (const char **pp, const char *endptr)
237 unsigned long ipart = 0, dpart = 0;
238 int nddigits = 0, sign = 1;
240 static double ipot[DDIGITS_MAX+1] = {
255 skip_white(&p, endptr);
257 (!isdigit(p[0]) && p[0] != '.' &&
258 p[0] != '+' && p[0] != '-')) {
259 WARN("Could not find a numeric object.");
264 if (p + 1 >= endptr) {
265 WARN("Could not find a numeric object.");
270 } else if (p[0] == '+') {
271 if (p + 1 >= endptr) {
272 WARN("Could not find a numeric object.");
279 while (p < endptr && !istokensep(p[0])) {
281 if (has_dot) { /* Two dots */
282 WARN("Could not find a numeric object.");
287 } else if (isdigit(p[0])) {
289 if (nddigits == DDIGITS_MAX && pdf_obj_get_verbose() > 1) {
290 WARN("Number with more than %d fractional digits.", DDIGITS_MAX);
291 } else if (nddigits < DDIGITS_MAX) {
292 dpart = dpart * 10 + p[0] - '0';
294 } /* Ignore decimal digits more than DDIGITS_MAX */
296 ipart = ipart * 10 + p[0] - '0';
299 WARN("Could not find a numeric object.");
306 return pdf_new_number((double) sign * (((double ) ipart) + dpart * ipot[nddigits]));
312 * PDF-1.2+: Two hexadecimal digits preceded by a number sign.
315 pn_getc (const char **pp, const char *endptr)
322 if (p + 2 >= endptr) {
326 if (!isxdigit(p[1]) || !isxdigit(p[2])) {
330 ch = (xtoi(p[1]) << 4);
341 #ifndef PDF_NAME_LEN_MAX
342 #define PDF_NAME_LEN_MAX 128
345 #ifndef PDF_STRING_LEN_MAX
346 #define PDF_STRING_LEN_MAX 65535
349 #define STRING_BUFFER_SIZE PDF_STRING_LEN_MAX+1
350 static char sbuf[PDF_STRING_LEN_MAX+1];
354 parse_pdf_name (const char **pp, const char *endptr)
356 char name[PDF_NAME_LEN_MAX+1];
359 skip_white(pp, endptr);
360 if (*pp >= endptr || **pp != '/') {
361 WARN("Could not find a name object.");
366 while (*pp < endptr && !istokensep(**pp)) {
367 ch = pn_getc(pp, endptr);
368 if (ch < 0 || ch > 0xff) {
369 WARN("Invalid char in PDF name object. (ignored)");
370 } else if (ch == 0) {
371 WARN("Null char not allowed in PDF name object. (ignored)");
372 } else if (len < STRING_BUFFER_SIZE) {
373 if (len == PDF_NAME_LEN_MAX) {
374 WARN("PDF name length too long. (>= %d bytes)", PDF_NAME_LEN_MAX);
378 WARN("PDF name length too long. (>= %d bytes, truncated)",
383 WARN("No valid name object found.");
388 return pdf_new_name(name);
392 parse_pdf_boolean (const char **pp, const char *endptr)
394 skip_white(pp, endptr);
395 if (*pp + 4 <= endptr &&
396 !strncmp(*pp, "true", 4)) {
397 if (*pp + 4 == endptr ||
398 istokensep(*(*pp + 4))) {
400 return pdf_new_boolean(1);
402 } else if (*pp + 5 <= endptr &&
403 !strncmp(*pp, "false", 5)) {
404 if (*pp + 5 == endptr ||
405 istokensep(*(*pp + 5))) {
407 return pdf_new_boolean(0);
411 WARN("Not a boolean object.");
417 parse_pdf_null (const char **pp, const char *endptr)
419 skip_white(pp, endptr);
420 if (*pp + 4 > endptr) {
421 WARN("Not a null object.");
423 } else if (*pp + 4 < endptr &&
424 !istokensep(*(*pp+4))) {
425 WARN("Not a null object.");
427 } else if (!strncmp(*pp, "null", 4)) {
429 return pdf_new_null();
432 WARN("Not a null object.");
441 #define isodigit(c) ((c) >= '0' && (c) <= '7')
444 ps_getescc (const char **pp, const char *endptr)
449 p = *pp + 1; /* backslash assumed. */
451 case 'n': ch = '\n'; p++; break;
452 case 'r': ch = '\r'; p++; break;
453 case 't': ch = '\t'; p++; break;
454 case 'b': ch = '\b'; p++; break;
455 case 'f': ch = '\f'; p++; break;
458 * An end-of-line marker preceded by a backslash must be ignored.
467 if (p < endptr && p[0] == '\n')
472 p[0] == '(' || p[0] == ')') {
475 } else if (isodigit(p[0])) {
477 /* Don't forget isodigit() is a macro. */
479 p < endptr && isodigit(p[0]); i++) {
480 ch = (ch << 3) + (p[0] - '0');
483 ch = (ch & 0xff); /* Ignore overflow. */
485 ch = ((unsigned char) p[0]); /* Ignore only backslash. */
495 parse_pdf_literal_string (const char **pp, const char *endptr)
497 int ch, op_count = 0, len = 0;
502 skip_white(&p, endptr);
504 if (p >= endptr || p[0] != '(')
509 /* The carriage return (CR, 0x0d) and line feed (LF, 0x0a) characters,
510 * also called newline characters, are treated as end-of-line (EOL)
511 * markers. The combination of a carriage return followed immediately
512 * by a line feed is treated as one EOL marker.
513 * [PDF Reference, 6th ed., version 1.7, p. 50] */
515 /* If an end-of-line marker appears within a literal string
516 * without a preceding backslash, the result is equivalent to
517 * \n (regardless of whether the end-of-line marker was
518 * a carriage return, a line feed, or both).
519 * [PDF Reference, 6th ed., version 1.7, p. 55] */
525 if (ch == ')' && op_count < 1)
528 #ifndef PDF_PARSE_STRICT
529 if (parser_state.tainted) {
530 if (p + 1 < endptr && (ch & 0x80)) {
531 if (len + 2 >= PDF_STRING_LEN_MAX) {
532 WARN("PDF string length too long. (limit: %ld)",
542 #endif /* !PDF_PARSE_STRICT */
544 if (len + 1 >= PDF_STRING_LEN_MAX) {
545 WARN("PDF string length too long. (limit: %ld)",
552 ch = ps_getescc(&p, endptr);
554 sbuf[len++] = (ch & 0xff);
558 if (p < endptr && p[0] == '\n')
574 p >= endptr || p[0] != ')') {
575 WARN("Unbalanced parens/truncated PDF literal string.");
580 return pdf_new_string(sbuf, len);
589 if (ch >= '0' && ch <= '9')
591 if (ch >= 'A' && ch <= 'F')
592 return (ch - 'A') + 10;
593 if (ch >= 'a' && ch <= 'f')
594 return (ch - 'a') + 10;
600 parse_pdf_hex_string (const char **pp, const char *endptr)
607 skip_white(&p, endptr);
608 if (p >= endptr || p[0] != '<')
615 * PDF Reference does not describe how to treat invalid char.
616 * Zero is appended if final hex digit is missing.
618 while (p < endptr && p[0] != '>' && len < PDF_STRING_LEN_MAX) {
621 skip_white(&p, endptr);
622 if (p >= endptr || p[0] == '>')
625 ch = (xtoi(p[0]) << 4);
628 skip_white(&p, endptr);
629 if (p < endptr && p[0] != '>') {
633 sbuf[len++] = (ch & 0xff);
637 WARN("Premature end of input hex string.");
639 } else if (p[0] != '>') {
640 WARN("PDF string length too long. (limit: %ld)", PDF_STRING_LEN_MAX);
645 return pdf_new_string(sbuf, len);
649 parse_pdf_string (const char **pp, const char *endptr)
651 skip_white(pp, endptr);
652 if (*pp + 2 <= endptr) {
654 return parse_pdf_literal_string(pp, endptr);
655 else if (**pp == '<' &&
656 (*(*pp + 1) == '>' || isxdigit(*(*pp + 1)))) {
657 return parse_pdf_hex_string(pp, endptr);
661 WARN("Could not find a string object.");
666 #ifndef PDF_PARSE_STRICT
668 parse_pdf_tainted_dict (const char **pp, const char *endptr)
672 parser_state.tainted = 1;
673 result = parse_pdf_dict(pp, endptr, NULL);
674 parser_state.tainted = 0;
678 #else /* PDF_PARSE_STRICT */
680 parse_pdf_tainted_dict (const char **pp, const char *endptr)
682 return parse_pdf_dict(pp, endptr, NULL);
684 #endif /* !PDF_PARSE_STRICT */
687 parse_pdf_dict (const char **pp, const char *endptr, pdf_file *pf)
689 pdf_obj *result = NULL;
694 skip_white(&p, endptr);
696 /* At least four characters: <<>>. */
697 if (p + 4 > endptr ||
698 p[0] != '<' || p[1] != '<') {
703 result = pdf_new_dict();
705 skip_white(&p, endptr);
706 while (p < endptr && p[0] != '>') {
707 pdf_obj *key, *value;
709 skip_white(&p, endptr);
710 key = parse_pdf_name(&p, endptr);
712 WARN("Could not find a key in dictionary object.");
713 pdf_release_obj(result);
717 skip_white(&p, endptr);
719 value = parse_pdf_object(&p, endptr, pf);
721 pdf_release_obj(key);
722 pdf_release_obj(value);
723 pdf_release_obj(result);
724 WARN("Could not find a value in dictionary object.");
727 pdf_add_dict(result, key, value);
729 skip_white(&p, endptr);
732 if (p + 2 > endptr ||
733 p[0] != '>' || p[1] != '>') {
734 WARN("Syntax error: Dictionary object ended prematurely.");
735 pdf_release_obj(result);
739 *pp = p + 2; /* skip >> */
744 parse_pdf_array (const char **pp, const char *endptr, pdf_file *pf)
751 skip_white(&p, endptr);
752 if (p + 2 > endptr || p[0] != '[') {
753 WARN("Could not find an array object.");
757 result = pdf_new_array();
760 skip_white(&p, endptr);
762 while (p < endptr && p[0] != ']') {
765 elem = parse_pdf_object(&p, endptr, pf);
767 pdf_release_obj(result);
768 WARN("Could not find a valid object in array object.");
771 pdf_add_array(result, elem);
773 skip_white(&p, endptr);
776 if (p >= endptr || p[0] != ']') {
777 WARN("Array object ended prematurely.");
778 pdf_release_obj(result);
782 *pp = p + 1; /* skip ] */
787 parse_pdf_stream (const char **pp, const char *endptr, pdf_obj *dict, pdf_file *pf)
789 pdf_obj *result = NULL;
791 pdf_obj *stream_dict;
795 skip_white(&p, endptr);
796 if (p + 6 > endptr ||
797 strncmp(p, "stream", 6)) {
802 /* The keyword stream that follows the stream dictionary
803 * should be followed by an end-of-line marker consisting of
804 * either a carriage return (0x0D;\r) and a line feed (0x0A;\n)
805 * or just a line feed, and not by a carriage return alone.
806 * [PDF Reference, 6th ed., version 1.7, pp. 60-61] */
808 /* Notice that TeX translates an end-of-line marker to a single space. */
809 if (p < endptr && p[0] == '\n') {
811 } else if (p + 1 < endptr &&
812 (p[0] == '\r' && p[1] == '\n')) {
820 tmp = pdf_lookup_dict(dict, "Length");
823 tmp2 = pdf_deref_obj(tmp);
824 if (pdf_obj_typeof(tmp2) != PDF_NUMBER)
827 stream_length = (long) pdf_number_value(tmp2);
829 pdf_release_obj(tmp2);
837 if (stream_length < 0 ||
838 p + stream_length > endptr)
842 * If Filter is not applied, set STREAM_COMPRESS flag.
843 * Should we use filter for ASCIIHexEncode/ASCII85Encode-ed streams?
848 filters = pdf_lookup_dict(dict, "Filter");
849 if (!filters && stream_length > 10) {
850 result = pdf_new_stream(STREAM_COMPRESS);
852 result = pdf_new_stream(0);
856 stream_dict = pdf_stream_dict(result);
857 pdf_merge_dict(stream_dict, dict);
859 pdf_add_stream(result, p, stream_length);
862 /* Check "endstream" */
864 /* It is recommended that there be an end-of-line marker
865 * after the data and before endstream; this marker is not included
866 * in the stream length.
867 * [PDF Reference, 6th ed., version 1.7, p. 61] */
868 if (p < endptr && p[0] == '\r')
870 if (p < endptr && p[0] == '\n')
873 if (p + 9 > endptr ||
874 memcmp(p, "endstream", 9)) {
875 pdf_release_obj(result);
885 #ifndef PDF_PARSE_STRICT
887 /* PLEASE REMOVE THIS */
888 #include "specials.h"
890 /* This is not a PDF indirect reference. */
892 parse_pdf_reference (const char **start, const char *end)
894 pdf_obj *result = NULL;
899 skip_white(start, end);
900 name = parse_opt_ident(start, end);
902 result = spc_lookup_reference(name);
904 WARN("Could not find the named reference (@%s).", name);
905 DUMP_RESTORE(*start, end);
909 WARN("Could not find a reference name.");
910 DUMP_RESTORE(*start, end);
916 #endif /* !PDF_PARSE_STRICT */
919 try_pdf_reference (const char *start, const char *end, const char **endptr, pdf_file *pf)
921 unsigned long id = 0;
922 unsigned short gen = 0;
929 skip_white(&start, end);
930 if (start > end - 5 || !isdigit(*start)) {
933 while (!is_space(*start)) {
934 if (start >= end || !isdigit(*start)) {
937 id = id * 10 + (*start - '0');
941 skip_white(&start, end);
942 if (start >= end || !isdigit(*start))
944 while (!is_space(*start)) {
945 if (start >= end || !isdigit(*start))
947 gen = gen * 10 + (*start - '0');
951 skip_white(&start, end);
952 if (start >= end || *start != 'R')
955 if (!PDF_TOKEN_END(start, end))
961 return pdf_new_indirect(pf, id, gen);
965 parse_pdf_object (const char **pp, const char *endptr, pdf_file *pf)
966 /* If pf is NULL, then indirect references are not allowed */
968 pdf_obj *result = NULL;
971 skip_white(pp, endptr);
973 WARN("Could not find any valid object.");
981 if (*(*pp + 1) != '<') {
982 result = parse_pdf_hex_string(pp, endptr);
986 result = parse_pdf_dict(pp, endptr, pf);
987 skip_white(pp, endptr);
989 *pp <= endptr - 15 &&
990 !memcmp(*pp, "stream", 6)) {
992 result = parse_pdf_stream(pp, endptr, dict, pf);
993 pdf_release_obj(dict);
999 result = parse_pdf_string(pp, endptr);
1002 result = parse_pdf_array(pp, endptr, pf);
1005 result = parse_pdf_name(pp, endptr);
1008 result = parse_pdf_null(pp, endptr);
1011 result = parse_pdf_boolean(pp, endptr);
1013 case '+': case '-': case '.':
1014 result = parse_pdf_number(pp, endptr);
1016 case '0': case '1': case '2': case '3': case '4':
1017 case '5': case '6': case '7': case '8': case '9':
1020 * If pf != NULL, then we are parsing a PDF file,
1021 * and indirect references are allowed.
1023 if (pf && (result = try_pdf_reference(*pp, endptr, &nextptr, pf))) {
1026 result = parse_pdf_number(pp, endptr);
1032 #ifndef PDF_PARSE_STRICT
1033 result = parse_pdf_reference(pp, endptr);
1034 #endif /* !PDF_PARSE_STRICT */
1038 WARN("Unknown PDF object type.");