2 extract headers from HTML files
3 in a format suitable for turning into a permuted index
12 maximum sizes for input line and for name in <a> tag
19 all return 0 for OK, 1 for errors
21 int do_file( char *, FILE * ) ;
22 int parse_line( char * ) ;
23 int print_line( char *, char *) ;
24 int print_header_problem( char * ) ;
33 int main(int argc, char* argv[])
36 int temp, done, status ;
46 if( isdigit(p[1]) && p[2] == '\0' ) {
47 max_level = p[1] - '0' ;  // convert the ASCII digit to its numeric value (0-9)
51 else die("unknown option") ;
56 if( (status = do_file("STDIN", stdin)) == 0 )
61 printf("ARGC = %d\n", argc ) ;
66 printf("ARGV P %s %s\n", *argv, p) ;
69 fprintf(stderr, "%s: null filename pointer\n", prog_name) ;
72 else if( (fp = fopen(p,"r")) == NULL ) {
73 fprintf(stderr, "%s: cannot open file %s\n", prog_name, p) ;
77 if( (temp = do_file(p, fp)) != 0 )
87 printf("%s: %d files processed, %d with errors\n", prog_name, done, status) ;
89 return( status ? 1 : 0 ) ;
92 void die( char *message )
95 fprintf(stderr, "%s: %s\n", prog_name, message) ;
99 int header_flags[10] ;
102 char buffer[MAX_LINE+1] ;
103 char label[MAX_NAME+1] ;
105 int do_file( char *file, FILE *fp )
107 int i, status, x, y ;
113 for( i = 0 ; i < 10 ; i++ )
114 header_flags[i] = 0 ;
115 current_file = file ;
117 while( base = fgets(buffer, MAX_LINE, fp) ) {
118 // count < and > characters in line
119 for( x = y = 0, p = base ; *p ; p++ )
130 // skip line if no < or >
131 if( x == 0 && y == 0 )
133 // report error for unequal count
135 if( strncmp( base, "<!--", 4) && strncmp(base, "-->", 3) ) {
137 fprintf(stderr, "%s in file %s: unequal < > counts %d %d\n",
138 prog_name, file, x, y ) ;
139 fprintf(stderr, "%s: %s\n", prog_name, base) ;
145 // parse lines containing tags
147 if( parse_line(base) )
149 // check that header labelling is sane
150 for( i = x = y = 0 ; i < 10 ; i++ ) {
151 // count non-zero entries
152 if( x = header_flags[i] )
154 // should be in 0 or 1 headers at a time
164 int parse_line( char *data )
170 for( end = data ; *end ; end++ )
172 // trim off trailing returns or newlines
173 for( p = end - 1, q = end ; q > data ; p--,q-- ) {
180 break ; // out of switch()
182 break ; // out of for()
187 // find tag delimiters
189 for( q = p + 1 ; *q ; q++ )
190 if( *q == '<' || *q == '>' )
192 // if we find another '<'
193 // restart tag search from it
198 // "<>" is not interesting
201 fprintf(stderr, "%s: null tag\n", prog_name) ;
202 fprintf(stderr, "%s: line %s\n", prog_name, data) ;  // show the offending input line alongside the null-tag report
207 // ignore delimiters once found
210 // p points to tag contents, null terminated
212 // save contents of <a name= > tags
216 (p[2] == 'n' || p[2] == 'N') &&
217 (p[3] == 'a' || p[3] == 'A') &&
218 (p[4] == 'm' || p[4] == 'M') &&
219 (p[5] == 'e' || p[5] == 'E') &&
221 strncpy(label, p + 7, MAX_NAME) ;
225 if( in_header && strlen(p) == 2 &&
226 (p[1] == 'r' || p[1] == 'R') )
232 if( strlen(p) == 2 && isdigit(p[1]) ) {
234 fprintf(stderr, "%s: bad header nesting in %s\n",
235 prog_name, current_file) ;
239 printf("%s\t%s\tH%d\t", current_file, label, x) ;
242 // only care about end-of-header
248 if( strlen(p) == 2 && isdigit(p[1]) ) {
250 fprintf(stderr, "%s: bad header nesting in %s\n",
251 prog_name, current_file) ;
260 // uninteresting tag, look for next
264 // tag done, point p beyond it
267 else if( in_header ) {
268 if( isprint(*p) && *p != '\n' )
280 int print_line( char *tag, char *text)
282 printf("%s\t%s\t%s\t%s\t\n", current_file, label, tag, text) ;  // tab-separated record: file, anchor label, tag, header text
286 int print_header_problem( char *file )
290 fprintf(stderr, "%s: HEADER TAG PROBLEM in file %s\n", prog_name, file) ;
291 fprintf(stderr, "%s: counts", prog_name) ;
292 for ( i = 0 ; i < 10 ; i++ )
293 fprintf(stderr, "\t%d", i) ;
294 fprintf(stderr,"\n") ;