1 /* tidylib.c -- internal library definitions
3 (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
4 See tidy.h for the copyright notice.
6 Defines HTML Tidy API implemented by tidy library.
8 Very rough initial cut for discussion purposes.
10 Public interface is const-correct and doesn't explicitly depend
11 on any globals. Thus, thread-safety may be introduced w/out
12 changing the interface.
14 Looking ahead to a C++ wrapper, C functions always pass
15 this-equivalent as 1st arg.
17 Created 2001-05-20 by Charles Reitzel
35 #ifdef TIDY_WIN32_MLANG_SUPPORT
38 #if !defined(NDEBUG) && defined(_MSC_VER)
42 /* Create/Destroy a Tidy "document" object */
43 static TidyDocImpl* tidyDocCreate( TidyAllocator *allocator );
44 static void tidyDocRelease( TidyDocImpl* impl );
46 static int tidyDocStatus( TidyDocImpl* impl );
49 static int tidyDocParseFile( TidyDocImpl* impl, ctmbstr htmlfil );
50 static int tidyDocParseStdin( TidyDocImpl* impl );
51 static int tidyDocParseString( TidyDocImpl* impl, ctmbstr content );
52 static int tidyDocParseBuffer( TidyDocImpl* impl, TidyBuffer* inbuf );
53 static int tidyDocParseSource( TidyDocImpl* impl, TidyInputSource* docIn );
56 /* Execute post-parse diagnostics and cleanup.
57 ** Note, the order is important. You will get different
58 ** results from the diagnostics depending on if they are run
59 ** pre-or-post repair.
61 static int tidyDocRunDiagnostics( TidyDocImpl* doc );
62 static void tidyDocReportDoctype( TidyDocImpl* doc );
63 static int tidyDocCleanAndRepair( TidyDocImpl* doc );
66 /* Save cleaned up file to file/buffer/sink */
67 static int tidyDocSaveFile( TidyDocImpl* impl, ctmbstr htmlfil );
68 static int tidyDocSaveStdout( TidyDocImpl* impl );
69 static int tidyDocSaveString( TidyDocImpl* impl, tmbstr buffer, uint* buflen );
70 static int tidyDocSaveBuffer( TidyDocImpl* impl, TidyBuffer* outbuf );
71 static int tidyDocSaveSink( TidyDocImpl* impl, TidyOutputSink* docOut );
72 static int tidyDocSaveStream( TidyDocImpl* impl, StreamOut* out );
75 TidyDocImpl* tidyDocToImpl( TidyDoc tdoc )
77 return (TidyDocImpl*) tdoc;
79 TidyDoc tidyImplToDoc( TidyDocImpl* impl )
81 return (TidyDoc) impl;
84 Node* tidyNodeToImpl( TidyNode tnod )
88 TidyNode tidyImplToNode( Node* node )
90 return (TidyNode) node;
93 AttVal* tidyAttrToImpl( TidyAttr tattr )
95 return (AttVal*) tattr;
97 TidyAttr tidyImplToAttr( AttVal* attval )
99 return (TidyAttr) attval;
102 const TidyOptionImpl* tidyOptionToImpl( TidyOption topt )
104 return (const TidyOptionImpl*) topt;
106 TidyOption tidyImplToOption( const TidyOptionImpl* option )
108 return (TidyOption) option;
112 /* Tidy public interface
114 ** Most functions return an integer:
122 TidyDoc TIDY_CALL tidyCreate(void)
124 TidyDocImpl* impl = tidyDocCreate( &TY_(g_default_allocator) );
125 return tidyImplToDoc( impl );
128 TidyDoc TIDY_CALL tidyCreateWithAllocator( TidyAllocator *allocator )
130 TidyDocImpl* impl = tidyDocCreate( allocator );
131 return tidyImplToDoc( impl );
134 void TIDY_CALL tidyRelease( TidyDoc tdoc )
136 TidyDocImpl* impl = tidyDocToImpl( tdoc );
137 tidyDocRelease( impl );
140 TidyDocImpl* tidyDocCreate( TidyAllocator *allocator )
142 TidyDocImpl* doc = (TidyDocImpl*)TidyAlloc( allocator, sizeof(TidyDocImpl) );
143 TidyClearMemory( doc, sizeof(*doc) );
144 doc->allocator = allocator;
147 TY_(InitTags)( doc );
148 TY_(InitAttrs)( doc );
149 TY_(InitConfig)( doc );
150 TY_(InitPrintBuf)( doc );
152 /* By default, wire tidy messages to standard error.
153 ** Document input will be set by parsing routines.
154 ** Document output will be set by pretty print routines.
155 ** Config input will be set by config parsing routines.
156 ** But we need to start off with a way to report errors.
158 doc->errout = TY_(StdErrOutput)();
162 void tidyDocRelease( TidyDocImpl* doc )
164 /* doc in/out opened and closed by parse/print routines */
167 assert( doc->docIn == NULL );
168 assert( doc->docOut == NULL );
170 TY_(ReleaseStreamOut)( doc, doc->errout );
173 TY_(FreePrintBuf)( doc );
174 TY_(FreeNode)(doc, &doc->root);
175 TidyClearMemory(&doc->root, sizeof(Node));
177 if (doc->givenDoctype)
178 TidyDocFree(doc, doc->givenDoctype);
180 TY_(FreeConfig)( doc );
181 TY_(FreeAttrTable)( doc );
182 TY_(FreeTags)( doc );
184 * Issue #186 - Now FreeNode depend on the doctype, so the lexer is needed
185 * to determine which hash is to be used, so free it last.
187 TY_(FreeLexer)( doc );
188 TidyDocFree( doc, doc );
192 /* Let application store a chunk of data w/ each Tidy tdocance.
193 ** Useful for callbacks.
195 void TIDY_CALL tidySetAppData( TidyDoc tdoc, void* appData )
197 TidyDocImpl* impl = tidyDocToImpl( tdoc );
199 impl->appData = appData;
201 void* TIDY_CALL tidyGetAppData( TidyDoc tdoc )
203 TidyDocImpl* impl = tidyDocToImpl( tdoc );
205 return impl->appData;
209 ctmbstr TIDY_CALL tidyReleaseDate(void)
211 return TY_(ReleaseDate)();
215 /* Get/set configuration options
217 Bool TIDY_CALL tidySetOptionCallback( TidyDoc tdoc, TidyOptCallback pOptCallback )
219 TidyDocImpl* impl = tidyDocToImpl( tdoc );
222 impl->pOptCallback = pOptCallback;
229 int TIDY_CALL tidyLoadConfig( TidyDoc tdoc, ctmbstr cfgfil )
231 TidyDocImpl* impl = tidyDocToImpl( tdoc );
233 return TY_(ParseConfigFile)( impl, cfgfil );
237 int TIDY_CALL tidyLoadConfigEnc( TidyDoc tdoc, ctmbstr cfgfil, ctmbstr charenc )
239 TidyDocImpl* impl = tidyDocToImpl( tdoc );
241 return TY_(ParseConfigFileEnc)( impl, cfgfil, charenc );
245 int TIDY_CALL tidySetCharEncoding( TidyDoc tdoc, ctmbstr encnam )
247 TidyDocImpl* impl = tidyDocToImpl( tdoc );
250 int enc = TY_(CharEncodingId)( impl, encnam );
251 if ( enc >= 0 && TY_(AdjustCharEncoding)(impl, enc) )
254 TY_(ReportBadArgument)( impl, "char-encoding" );
259 int TIDY_CALL tidySetInCharEncoding( TidyDoc tdoc, ctmbstr encnam )
261 TidyDocImpl* impl = tidyDocToImpl( tdoc );
264 int enc = TY_(CharEncodingId)( impl, encnam );
265 if ( enc >= 0 && TY_(SetOptionInt)( impl, TidyInCharEncoding, enc ) )
268 TY_(ReportBadArgument)( impl, "in-char-encoding" );
273 int TIDY_CALL tidySetOutCharEncoding( TidyDoc tdoc, ctmbstr encnam )
275 TidyDocImpl* impl = tidyDocToImpl( tdoc );
278 int enc = TY_(CharEncodingId)( impl, encnam );
279 if ( enc >= 0 && TY_(SetOptionInt)( impl, TidyOutCharEncoding, enc ) )
282 TY_(ReportBadArgument)( impl, "out-char-encoding" );
287 TidyOptionId TIDY_CALL tidyOptGetIdForName( ctmbstr optnam )
289 const TidyOptionImpl* option = TY_(lookupOption)( optnam );
292 return N_TIDY_OPTIONS; /* Error */
295 TidyIterator TIDY_CALL tidyGetOptionList( TidyDoc tdoc )
297 TidyDocImpl* impl = tidyDocToImpl( tdoc );
299 return TY_(getOptionList)( impl );
300 return (TidyIterator) -1;
303 TidyOption TIDY_CALL tidyGetNextOption( TidyDoc tdoc, TidyIterator* pos )
305 TidyDocImpl* impl = tidyDocToImpl( tdoc );
306 const TidyOptionImpl* option = NULL;
308 option = TY_(getNextOption)( impl, pos );
311 return tidyImplToOption( option );
315 TidyOption TIDY_CALL tidyGetOption( TidyDoc ARG_UNUSED(tdoc), TidyOptionId optId )
317 const TidyOptionImpl* option = TY_(getOption)( optId );
318 return tidyImplToOption( option );
320 TidyOption TIDY_CALL tidyGetOptionByName( TidyDoc ARG_UNUSED(doc), ctmbstr optnam )
322 const TidyOptionImpl* option = TY_(lookupOption)( optnam );
323 return tidyImplToOption( option );
326 TidyOptionId TIDY_CALL tidyOptGetId( TidyOption topt )
328 const TidyOptionImpl* option = tidyOptionToImpl( topt );
331 return N_TIDY_OPTIONS;
333 ctmbstr TIDY_CALL tidyOptGetName( TidyOption topt )
335 const TidyOptionImpl* option = tidyOptionToImpl( topt );
340 TidyOptionType TIDY_CALL tidyOptGetType( TidyOption topt )
342 const TidyOptionImpl* option = tidyOptionToImpl( topt );
345 return (TidyOptionType) -1;
347 TidyConfigCategory TIDY_CALL tidyOptGetCategory( TidyOption topt )
349 const TidyOptionImpl* option = tidyOptionToImpl( topt );
351 return option->category;
352 return (TidyConfigCategory) -1;
354 ctmbstr TIDY_CALL tidyOptGetDefault( TidyOption topt )
356 const TidyOptionImpl* option = tidyOptionToImpl( topt );
357 if ( option && option->type == TidyString )
358 return (ctmbstr) option->dflt;
361 ulong TIDY_CALL tidyOptGetDefaultInt( TidyOption topt )
363 const TidyOptionImpl* option = tidyOptionToImpl( topt );
364 if ( option && option->type != TidyString )
368 Bool TIDY_CALL tidyOptGetDefaultBool( TidyOption topt )
370 const TidyOptionImpl* option = tidyOptionToImpl( topt );
371 if ( option && option->type != TidyString )
372 return ( option->dflt ? yes : no );
375 Bool TIDY_CALL tidyOptIsReadOnly( TidyOption topt )
377 const TidyOptionImpl* option = tidyOptionToImpl( topt );
379 return ( option->parser == NULL );
384 TidyIterator TIDY_CALL tidyOptGetPickList( TidyOption topt )
386 const TidyOptionImpl* option = tidyOptionToImpl( topt );
388 return TY_(getOptionPickList)( option );
389 return (TidyIterator) -1;
391 ctmbstr TIDY_CALL tidyOptGetNextPick( TidyOption topt, TidyIterator* pos )
393 const TidyOptionImpl* option = tidyOptionToImpl( topt );
395 return TY_(getNextOptionPick)( option, pos );
400 ctmbstr TIDY_CALL tidyOptGetValue( TidyDoc tdoc, TidyOptionId optId )
402 TidyDocImpl* impl = tidyDocToImpl( tdoc );
403 ctmbstr optval = NULL;
405 optval = cfgStr( impl, optId );
408 Bool TIDY_CALL tidyOptSetValue( TidyDoc tdoc, TidyOptionId optId, ctmbstr val )
410 TidyDocImpl* impl = tidyDocToImpl( tdoc );
412 return TY_(ParseConfigValue)( impl, optId, val );
415 Bool TIDY_CALL tidyOptParseValue( TidyDoc tdoc, ctmbstr optnam, ctmbstr val )
417 TidyDocImpl* impl = tidyDocToImpl( tdoc );
419 return TY_(ParseConfigOption)( impl, optnam, val );
423 ulong TIDY_CALL tidyOptGetInt( TidyDoc tdoc, TidyOptionId optId )
425 TidyDocImpl* impl = tidyDocToImpl( tdoc );
428 opti = cfg( impl, optId );
432 Bool TIDY_CALL tidyOptSetInt( TidyDoc tdoc, TidyOptionId optId, ulong val )
434 TidyDocImpl* impl = tidyDocToImpl( tdoc );
436 return TY_(SetOptionInt)( impl, optId, val );
440 Bool TIDY_CALL tidyOptGetBool( TidyDoc tdoc, TidyOptionId optId )
442 TidyDocImpl* impl = tidyDocToImpl( tdoc );
446 const TidyOptionImpl* option = TY_(getOption)( optId );
449 optb = cfgBool( impl, optId );
455 Bool TIDY_CALL tidyOptSetBool( TidyDoc tdoc, TidyOptionId optId, Bool val )
457 TidyDocImpl* impl = tidyDocToImpl( tdoc );
459 return TY_(SetOptionBool)( impl, optId, val );
463 ctmbstr TIDY_CALL tidyOptGetEncName( TidyDoc tdoc, TidyOptionId optId )
465 uint enc = tidyOptGetInt( tdoc, optId );
466 return TY_(CharEncodingOptName)( enc );
469 ctmbstr TIDY_CALL tidyOptGetCurrPick( TidyDoc tdoc, TidyOptionId optId )
471 const TidyOptionImpl* option = TY_(getOption)( optId );
472 if ( option && option->pickList )
474 uint ix, pick = tidyOptGetInt( tdoc, optId );
475 const ctmbstr* pL = option->pickList;
476 for ( ix=0; *pL && ix < pick; ++ix )
485 TidyIterator TIDY_CALL tidyOptGetDeclTagList( TidyDoc tdoc )
487 TidyDocImpl* impl = tidyDocToImpl( tdoc );
488 TidyIterator declIter = 0;
490 declIter = TY_(GetDeclaredTagList)( impl );
494 ctmbstr TIDY_CALL tidyOptGetNextDeclTag( TidyDoc tdoc, TidyOptionId optId,
497 TidyDocImpl* impl = tidyDocToImpl( tdoc );
498 ctmbstr tagnam = NULL;
501 UserTagType tagtyp = tagtype_null;
502 if ( optId == TidyInlineTags )
503 tagtyp = tagtype_inline;
504 else if ( optId == TidyBlockTags )
505 tagtyp = tagtype_block;
506 else if ( optId == TidyEmptyTags )
507 tagtyp = tagtype_empty;
508 else if ( optId == TidyPreTags )
509 tagtyp = tagtype_pre;
510 if ( tagtyp != tagtype_null )
511 tagnam = TY_(GetNextDeclaredTag)( impl, tagtyp, iter );
516 ctmbstr TIDY_CALL tidyOptGetDoc( TidyDoc ARG_UNUSED(tdoc), TidyOption opt )
518 const TidyOptionId optId = tidyOptGetId( opt );
519 const TidyOptionDoc* docDesc = TY_(OptGetDocDesc)( optId );
520 return docDesc ? docDesc->doc : NULL;
523 TidyIterator TIDY_CALL tidyOptGetDocLinksList( TidyDoc ARG_UNUSED(tdoc), TidyOption opt )
525 const TidyOptionId optId = tidyOptGetId( opt );
526 const TidyOptionDoc* docDesc = TY_(OptGetDocDesc)( optId );
527 if (docDesc && docDesc->links)
528 return (TidyIterator)docDesc->links;
529 return (TidyIterator)NULL;
532 TidyOption TIDY_CALL tidyOptGetNextDocLinks( TidyDoc tdoc, TidyIterator* pos )
534 const TidyOptionId* curr = (const TidyOptionId *)*pos;
537 if (*curr == TidyUnknownOption)
539 *pos = (TidyIterator)NULL;
540 return (TidyOption)0;
542 opt = tidyGetOption(tdoc, *curr);
544 *pos = (*curr == TidyUnknownOption ) ?
545 (TidyIterator)NULL:(TidyIterator)curr;
549 int TIDY_CALL tidyOptSaveFile( TidyDoc tdoc, ctmbstr cfgfil )
551 TidyDocImpl* impl = tidyDocToImpl( tdoc );
553 return TY_(SaveConfigFile)( impl, cfgfil );
557 int TIDY_CALL tidyOptSaveSink( TidyDoc tdoc, TidyOutputSink* sink )
559 TidyDocImpl* impl = tidyDocToImpl( tdoc );
561 return TY_(SaveConfigSink)( impl, sink );
565 Bool TIDY_CALL tidyOptSnapshot( TidyDoc tdoc )
567 TidyDocImpl* impl = tidyDocToImpl( tdoc );
570 TY_(TakeConfigSnapshot)( impl );
575 Bool TIDY_CALL tidyOptResetToSnapshot( TidyDoc tdoc )
577 TidyDocImpl* impl = tidyDocToImpl( tdoc );
580 TY_(ResetConfigToSnapshot)( impl );
585 Bool TIDY_CALL tidyOptResetAllToDefault( TidyDoc tdoc )
587 TidyDocImpl* impl = tidyDocToImpl( tdoc );
590 TY_(ResetConfigToDefault)( impl );
596 Bool TIDY_CALL tidyOptResetToDefault( TidyDoc tdoc, TidyOptionId optId )
598 TidyDocImpl* impl = tidyDocToImpl( tdoc );
600 return TY_(ResetOptionToDefault)( impl, optId );
604 Bool TIDY_CALL tidyOptDiffThanDefault( TidyDoc tdoc )
606 TidyDocImpl* impl = tidyDocToImpl( tdoc );
608 return TY_(ConfigDiffThanDefault)( impl );
611 Bool TIDY_CALL tidyOptDiffThanSnapshot( TidyDoc tdoc )
613 TidyDocImpl* impl = tidyDocToImpl( tdoc );
615 return TY_(ConfigDiffThanSnapshot)( impl );
619 Bool TIDY_CALL tidyOptCopyConfig( TidyDoc to, TidyDoc from )
621 TidyDocImpl* docTo = tidyDocToImpl( to );
622 TidyDocImpl* docFrom = tidyDocToImpl( from );
623 if ( docTo && docFrom )
625 TY_(CopyConfig)( docTo, docFrom );
632 /* I/O and Message handling interface
634 ** By default, Tidy will define, create and use
635 ** instances of input and output handlers for
636 ** standard C buffered I/O (i.e. FILE* stdin,
637 ** FILE* stdout and FILE* stderr for content
638 ** input, content output and diagnostic output,
639 ** respectively). A FILE* cfgFile input handler
640 ** will be used for config files. Command line
641 ** options will just be set directly.
644 /* Use TidyReportFilter to filter messages by diagnostic level:
645 ** info, warning, etc. Just set diagnostic output
646 ** handler to redirect all diagnostics output. Return true
647 ** to proceed with output, false to cancel.
649 Bool TIDY_CALL tidySetReportFilter( TidyDoc tdoc, TidyReportFilter filt )
651 TidyDocImpl* impl = tidyDocToImpl( tdoc );
654 impl->mssgFilt = filt;
660 Bool TIDY_CALL tidySetReportFilter2( TidyDoc tdoc, TidyReportFilter2 filt )
662 TidyDocImpl* impl = tidyDocToImpl( tdoc );
665 impl->mssgFilt2 = filt;
672 int tidySetContentOutputSink( TidyDoc tdoc, TidyOutputSink* outp )
674 TidyDocImpl* impl = tidyDocToImpl( tdoc );
682 int tidySetDiagnosticOutputSink( TidyDoc tdoc, TidyOutputSink* outp )
684 TidyDocImpl* impl = tidyDocToImpl( tdoc );
696 cmbstr tidyLookupMessage( TidyDoc tdoc, int errorNo )
698 TidyDocImpl* impl = tidyDocToImpl( tdoc );
701 mssg = tidyMessage_Lookup( impl->messages, errorNo );
707 FILE* TIDY_CALL tidySetErrorFile( TidyDoc tdoc, ctmbstr errfilnam )
709 TidyDocImpl* impl = tidyDocToImpl( tdoc );
712 FILE* errout = fopen( errfilnam, "wb" );
715 uint outenc = cfg( impl, TidyOutCharEncoding );
716 uint nl = cfg( impl, TidyNewline );
717 TY_(ReleaseStreamOut)( impl, impl->errout );
718 impl->errout = TY_(FileOutput)( impl, errout, outenc, nl );
721 else /* Emit message to current error sink */
722 TY_(FileError)( impl, errfilnam, TidyError );
727 int TIDY_CALL tidySetErrorBuffer( TidyDoc tdoc, TidyBuffer* errbuf )
729 TidyDocImpl* impl = tidyDocToImpl( tdoc );
732 uint outenc = cfg( impl, TidyOutCharEncoding );
733 uint nl = cfg( impl, TidyNewline );
734 TY_(ReleaseStreamOut)( impl, impl->errout );
735 impl->errout = TY_(BufferOutput)( impl, errbuf, outenc, nl );
736 return ( impl->errout ? 0 : -ENOMEM );
741 int TIDY_CALL tidySetErrorSink( TidyDoc tdoc, TidyOutputSink* sink )
743 TidyDocImpl* impl = tidyDocToImpl( tdoc );
746 uint outenc = cfg( impl, TidyOutCharEncoding );
747 uint nl = cfg( impl, TidyNewline );
748 TY_(ReleaseStreamOut)( impl, impl->errout );
749 impl->errout = TY_(UserOutput)( impl, sink, outenc, nl );
750 return ( impl->errout ? 0 : -ENOMEM );
757 int TIDY_CALL tidyStatus( TidyDoc tdoc )
759 TidyDocImpl* impl = tidyDocToImpl( tdoc );
760 int tidyStat = -EINVAL;
762 tidyStat = tidyDocStatus( impl );
765 int TIDY_CALL tidyDetectedHtmlVersion( TidyDoc ARG_UNUSED(tdoc) )
767 /* TidyDocImpl* impl = tidyDocToImpl( tdoc ); */
770 Bool TIDY_CALL tidyDetectedXhtml( TidyDoc ARG_UNUSED(tdoc) )
772 /* TidyDocImpl* impl = tidyDocToImpl( tdoc ); */
775 Bool TIDY_CALL tidyDetectedGenericXml( TidyDoc ARG_UNUSED(tdoc) )
777 /* TidyDocImpl* impl = tidyDocToImpl( tdoc ); */
781 uint TIDY_CALL tidyErrorCount( TidyDoc tdoc )
783 TidyDocImpl* impl = tidyDocToImpl( tdoc );
784 uint count = 0xFFFFFFFF;
786 count = impl->errors;
789 uint TIDY_CALL tidyWarningCount( TidyDoc tdoc )
791 TidyDocImpl* impl = tidyDocToImpl( tdoc );
792 uint count = 0xFFFFFFFF;
794 count = impl->warnings;
797 uint TIDY_CALL tidyAccessWarningCount( TidyDoc tdoc )
799 TidyDocImpl* impl = tidyDocToImpl( tdoc );
800 uint count = 0xFFFFFFFF;
802 count = impl->accessErrors;
805 uint TIDY_CALL tidyConfigErrorCount( TidyDoc tdoc )
807 TidyDocImpl* impl = tidyDocToImpl( tdoc );
808 uint count = 0xFFFFFFFF;
810 count = impl->optionErrors;
815 /* Error reporting functions
817 void TIDY_CALL tidyErrorSummary( TidyDoc tdoc )
819 TidyDocImpl* impl = tidyDocToImpl( tdoc );
821 TY_(ErrorSummary)( impl );
823 void TIDY_CALL tidyGeneralInfo( TidyDoc tdoc )
825 TidyDocImpl* impl = tidyDocToImpl( tdoc );
827 TY_(GeneralInfo)( impl );
833 ** Initial version supports only whole-file operations.
834 ** Do not expose Tidy StreamIn or Out data structures - yet.
837 /* Parse/load Functions
839 ** HTML/XHTML version determined from input.
841 int TIDY_CALL tidyParseFile( TidyDoc tdoc, ctmbstr filnam )
843 TidyDocImpl* doc = tidyDocToImpl( tdoc );
844 return tidyDocParseFile( doc, filnam );
846 int TIDY_CALL tidyParseStdin( TidyDoc tdoc )
848 TidyDocImpl* doc = tidyDocToImpl( tdoc );
849 return tidyDocParseStdin( doc );
851 int TIDY_CALL tidyParseString( TidyDoc tdoc, ctmbstr content )
853 TidyDocImpl* doc = tidyDocToImpl( tdoc );
854 return tidyDocParseString( doc, content );
856 int TIDY_CALL tidyParseBuffer( TidyDoc tdoc, TidyBuffer* inbuf )
858 TidyDocImpl* doc = tidyDocToImpl( tdoc );
859 return tidyDocParseBuffer( doc, inbuf );
861 int TIDY_CALL tidyParseSource( TidyDoc tdoc, TidyInputSource* source )
863 TidyDocImpl* doc = tidyDocToImpl( tdoc );
864 return tidyDocParseSource( doc, source );
868 int tidyDocParseFile( TidyDocImpl* doc, ctmbstr filnam )
871 return TY_(DocParseFileWithMappedFile)( doc, filnam );
873 int status = -ENOENT;
874 FILE* fin = fopen( filnam, "rb" );
876 #if PRESERVE_FILE_TIMES
877 struct stat sbuf = {0};
878 /* get last modified time */
879 TidyClearMemory( &doc->filetimes, sizeof(doc->filetimes) );
880 if ( fin && cfgBool(doc,TidyKeepFileTimes) &&
881 fstat(fileno(fin), &sbuf) != -1 )
883 doc->filetimes.actime = sbuf.st_atime;
884 doc->filetimes.modtime = sbuf.st_mtime;
890 StreamIn* in = TY_(FileInput)( doc, fin, cfg( doc, TidyInCharEncoding ));
896 status = TY_(DocParseStream)( doc, in );
897 TY_(freeFileSource)(&in->source, yes);
898 TY_(freeStreamIn)(in);
900 else /* Error message! */
901 TY_(FileError)( doc, filnam, TidyError );
906 int tidyDocParseStdin( TidyDocImpl* doc )
908 StreamIn* in = TY_(FileInput)( doc, stdin, cfg( doc, TidyInCharEncoding ));
909 int status = TY_(DocParseStream)( doc, in );
910 TY_(freeStreamIn)(in);
914 int tidyDocParseBuffer( TidyDocImpl* doc, TidyBuffer* inbuf )
916 int status = -EINVAL;
919 StreamIn* in = TY_(BufferInput)( doc, inbuf, cfg( doc, TidyInCharEncoding ));
920 status = TY_(DocParseStream)( doc, in );
921 TY_(freeStreamIn)(in);
926 int tidyDocParseString( TidyDocImpl* doc, ctmbstr content )
928 int status = -EINVAL;
934 tidyBufInitWithAllocator( &inbuf, doc->allocator );
935 tidyBufAttach( &inbuf, (byte*)content, TY_(tmbstrlen)(content)+1 );
936 in = TY_(BufferInput)( doc, &inbuf, cfg( doc, TidyInCharEncoding ));
937 status = TY_(DocParseStream)( doc, in );
938 tidyBufDetach( &inbuf );
939 TY_(freeStreamIn)(in);
944 int tidyDocParseSource( TidyDocImpl* doc, TidyInputSource* source )
946 StreamIn* in = TY_(UserInput)( doc, source, cfg( doc, TidyInCharEncoding ));
947 int status = TY_(DocParseStream)( doc, in );
948 TY_(freeStreamIn)(in);
953 /* Print/save Functions
956 int TIDY_CALL tidySaveFile( TidyDoc tdoc, ctmbstr filnam )
958 TidyDocImpl* doc = tidyDocToImpl( tdoc );
959 return tidyDocSaveFile( doc, filnam );
961 int TIDY_CALL tidySaveStdout( TidyDoc tdoc )
963 TidyDocImpl* doc = tidyDocToImpl( tdoc );
964 return tidyDocSaveStdout( doc );
966 int TIDY_CALL tidySaveString( TidyDoc tdoc, tmbstr buffer, uint* buflen )
968 TidyDocImpl* doc = tidyDocToImpl( tdoc );
969 return tidyDocSaveString( doc, buffer, buflen );
971 int TIDY_CALL tidySaveBuffer( TidyDoc tdoc, TidyBuffer* outbuf )
973 TidyDocImpl* doc = tidyDocToImpl( tdoc );
974 return tidyDocSaveBuffer( doc, outbuf );
976 int TIDY_CALL tidySaveSink( TidyDoc tdoc, TidyOutputSink* sink )
978 TidyDocImpl* doc = tidyDocToImpl( tdoc );
979 return tidyDocSaveSink( doc, sink );
982 int tidyDocSaveFile( TidyDocImpl* doc, ctmbstr filnam )
984 int status = -ENOENT;
987 /* Don't zap input file if no output */
988 if ( doc->errors > 0 &&
989 cfgBool(doc, TidyWriteBack) && !cfgBool(doc, TidyForceOutput) )
990 status = tidyDocStatus( doc );
992 fout = fopen( filnam, "wb" );
996 uint outenc = cfg( doc, TidyOutCharEncoding );
997 uint nl = cfg( doc, TidyNewline );
998 StreamOut* out = TY_(FileOutput)( doc, fout, outenc, nl );
1000 status = tidyDocSaveStream( doc, out );
1003 TidyDocFree( doc, out );
1005 #if PRESERVE_FILE_TIMES
1006 if ( doc->filetimes.actime )
1008 /* set file last accessed/modified times to original values */
1009 utime( filnam, &doc->filetimes );
1010 TidyClearMemory( &doc->filetimes, sizeof(doc->filetimes) );
1012 #endif /* PRESERVFILETIMES */
1014 if ( status < 0 ) /* Error message! */
1015 TY_(FileError)( doc, filnam, TidyError );
1021 /* Note, _setmode() does NOT work on Win2K Pro w/ VC++ 6.0 SP3.
1022 ** The code has been left in in case it works w/ other compilers
1023 ** or operating systems. If stdout is in Text mode, be aware that
1024 ** it will garble UTF16 documents. In text mode, when it encounters
1025 ** a single byte of value 10 (0xA), it will insert a single byte
1026 ** value 13 (0xD) just before it. This has the effect of garbling
1027 ** the entire document.
1030 #if !defined(NO_SETMODE_SUPPORT)
1032 #if defined(_WIN32) || defined(OS2_OS)
1039 int tidyDocSaveStdout( TidyDocImpl* doc )
1041 #if !defined(NO_SETMODE_SUPPORT)
1043 #if defined(_WIN32) || defined(OS2_OS)
1044 int oldstdoutmode = -1, oldstderrmode = -1;
1049 uint outenc = cfg( doc, TidyOutCharEncoding );
1050 uint nl = cfg( doc, TidyNewline );
1051 StreamOut* out = TY_(FileOutput)( doc, stdout, outenc, nl );
1053 #if !defined(NO_SETMODE_SUPPORT)
1055 #if defined(_WIN32) || defined(OS2_OS)
1056 oldstdoutmode = setmode( fileno(stdout), _O_BINARY );
1057 oldstderrmode = setmode( fileno(stderr), _O_BINARY );
1063 status = tidyDocSaveStream( doc, out );
1068 #if !defined(NO_SETMODE_SUPPORT)
1070 #if defined(_WIN32) || defined(OS2_OS)
1071 if ( oldstdoutmode != -1 )
1072 oldstdoutmode = setmode( fileno(stdout), oldstdoutmode );
1073 if ( oldstderrmode != -1 )
1074 oldstderrmode = setmode( fileno(stderr), oldstderrmode );
1079 TidyDocFree( doc, out );
1083 int tidyDocSaveString( TidyDocImpl* doc, tmbstr buffer, uint* buflen )
1085 uint outenc = cfg( doc, TidyOutCharEncoding );
1086 uint nl = cfg( doc, TidyNewline );
1091 tidyBufInitWithAllocator( &outbuf, doc->allocator );
1092 out = TY_(BufferOutput)( doc, &outbuf, outenc, nl );
1093 status = tidyDocSaveStream( doc, out );
1095 if ( outbuf.size > *buflen )
1098 memcpy( buffer, outbuf.bp, outbuf.size );
1100 *buflen = outbuf.size;
1101 tidyBufFree( &outbuf );
1102 TidyDocFree( doc, out );
1106 int tidyDocSaveBuffer( TidyDocImpl* doc, TidyBuffer* outbuf )
1108 int status = -EINVAL;
1111 uint outenc = cfg( doc, TidyOutCharEncoding );
1112 uint nl = cfg( doc, TidyNewline );
1113 StreamOut* out = TY_(BufferOutput)( doc, outbuf, outenc, nl );
1115 status = tidyDocSaveStream( doc, out );
1116 TidyDocFree( doc, out );
1121 int tidyDocSaveSink( TidyDocImpl* doc, TidyOutputSink* sink )
1123 uint outenc = cfg( doc, TidyOutCharEncoding );
1124 uint nl = cfg( doc, TidyNewline );
1125 StreamOut* out = TY_(UserOutput)( doc, sink, outenc, nl );
1126 int status = tidyDocSaveStream( doc, out );
1127 TidyDocFree( doc, out );
1131 int tidyDocStatus( TidyDocImpl* doc )
1133 if ( doc->errors > 0 )
1135 if ( doc->warnings > 0 || doc->accessErrors > 0 )
1142 int TIDY_CALL tidyCleanAndRepair( TidyDoc tdoc )
1144 TidyDocImpl* impl = tidyDocToImpl( tdoc );
1146 return tidyDocCleanAndRepair( impl );
1150 int TIDY_CALL tidyRunDiagnostics( TidyDoc tdoc )
1152 TidyDocImpl* impl = tidyDocToImpl( tdoc );
1154 return tidyDocRunDiagnostics( impl );
1158 int TIDY_CALL tidyReportDoctype( TidyDoc tdoc )
1161 TidyDocImpl* impl = tidyDocToImpl( tdoc );
1163 tidyDocReportDoctype( impl );
1169 /* Workhorse functions.
1171 ** Parse requires input source, all input config items
1172 ** and diagnostic sink to have all been set before calling.
1174 ** Emit likewise requires that document sink and all
1175 ** pretty printing options have been set.
1177 static ctmbstr integrity = "\nPanic - tree has lost its integrity\n";
1179 int TY_(DocParseStream)( TidyDocImpl* doc, StreamIn* in )
1181 Bool xmlIn = cfgBool( doc, TidyXmlTags );
1184 assert( doc != NULL && in != NULL );
1185 assert( doc->docIn == NULL );
1188 TY_(TakeConfigSnapshot)( doc ); /* Save config state */
1189 TY_(FreeAnchors)( doc );
1191 TY_(FreeNode)(doc, &doc->root);
1192 TidyClearMemory(&doc->root, sizeof(Node));
1194 if (doc->givenDoctype)
1195 TidyDocFree(doc, doc->givenDoctype);
1197 * Issue #186 - Now FreeNode depend on the doctype, so the lexer is needed
1198 * to determine which hash is to be used, so free it last.
1200 TY_(FreeLexer)( doc );
1201 doc->givenDoctype = NULL;
1203 doc->lexer = TY_(NewLexer)( doc );
1204 /* doc->lexer->root = &doc->root; */
1205 doc->root.line = doc->lexer->lines;
1206 doc->root.column = doc->lexer->columns;
1207 doc->inputHadBOM = no;
1209 bomEnc = TY_(ReadBOMEncoding)(in);
1213 in->encoding = bomEnc;
1214 TY_(SetOptionInt)(doc, TidyInCharEncoding, bomEnc);
1217 #ifdef TIDY_WIN32_MLANG_SUPPORT
1218 if (in->encoding > WIN32MLANG)
1219 TY_(Win32MLangInitInputTranscoder)(in, in->encoding);
1220 #endif /* TIDY_WIN32_MLANG_SUPPORT */
1222 /* Tidy doesn't alter the doctype for generic XML docs */
1225 TY_(ParseXMLDocument)( doc );
1226 if ( !TY_(CheckNodeIntegrity)( &doc->root ) )
1227 TidyPanic( doc->allocator, integrity );
1232 TY_(ParseDocument)( doc );
1233 if ( !TY_(CheckNodeIntegrity)( &doc->root ) )
1234 TidyPanic( doc->allocator, integrity );
1237 #ifdef TIDY_WIN32_MLANG_SUPPORT
1238 TY_(Win32MLangUninitInputTranscoder)(in);
1239 #endif /* TIDY_WIN32_MLANG_SUPPORT */
1242 return tidyDocStatus( doc );
1245 int tidyDocRunDiagnostics( TidyDocImpl* doc )
1247 Bool quiet = cfgBool( doc, TidyQuiet );
1248 Bool force = cfgBool( doc, TidyForceOutput );
1253 TY_(ReportMarkupVersion)( doc );
1254 TY_(ReportNumWarnings)( doc );
1257 if ( doc->errors > 0 && !force )
1258 TY_(NeedsAuthorIntervention)( doc );
1260 return tidyDocStatus( doc );
1263 void tidyDocReportDoctype( TidyDocImpl* doc )
1265 TY_(ReportMarkupVersion)( doc );
1269 /* ######################################################################################
1272 #if !defined(NDEBUG) && defined(_MSC_VER)
1273 extern void show_not_html5(void);
1274 /* -----------------------------
1275 List tags that do not have version HTML5 (HT50|XH50)
1277 acronym applet basefont big center dir font frame frameset isindex
1278 listing noframes plaintext rb rbc rtc strike tt xmp nextid
1279 align bgsound blink comment ilayer layer marquee multicol nobr noembed
1280 nolayer nosave server servlet spacer
1282 Listed total 35 tags that do not have version 393216
1283 ------------------------------ */
1285 static void list_not_html5(void)
1287 static Bool done_list = no;
1288 if (done_list == no) {
1295 /* What about <blink>, <s> strike-through, <u> underline */
1296 static struct _html5Info
1300 } const html5Info[] = {
1301 {"acronym", TidyTag_ACRONYM},
1302 {"applet", TidyTag_APPLET },
1303 {"basefont",TidyTag_BASEFONT },
1304 { "big", TidyTag_BIG },
1305 { "center", TidyTag_CENTER },
1306 { "dir", TidyTag_DIR },
1307 { "font", TidyTag_FONT },
1308 { "frame", TidyTag_FRAME},
1309 { "frameset", TidyTag_FRAMESET},
1310 { "noframes", TidyTag_NOFRAMES },
1311 { "strike", TidyTag_STRIKE },
1312 { "tt", TidyTag_TT },
1315 Bool inRemovedInfo( uint tid )
1318 for (i = 0; ; i++) {
1319 if (html5Info[i].tag == 0)
1321 if (html5Info[i].id == tid)
1327 static Bool BadBody5( Node* node )
1329 if (TY_(AttrGetById)(node, TidyAttr_BACKGROUND) ||
1330 TY_(AttrGetById)(node, TidyAttr_BGCOLOR) ||
1331 TY_(AttrGetById)(node, TidyAttr_TEXT) ||
1332 TY_(AttrGetById)(node, TidyAttr_LINK) ||
1333 TY_(AttrGetById)(node, TidyAttr_VLINK) ||
1334 TY_(AttrGetById)(node, TidyAttr_ALINK))
1341 static Bool nodeHasAlignAttr( Node *node )
1343 /* #define attrIsALIGN(av) AttrIsId( av, TidyAttr_ALIGN ) */
1345 for ( av = node->attributes; av != NULL; av = av->next ) {
1346 if (attrIsALIGN(av))
1352 /* see http://www.whatwg.org/specs/web-apps/current-work/multipage/obsolete.html#obsolete */
1354 void TY_(CheckHTML5)( TidyDocImpl* doc, Node* node )
1356 /* Lexer* lexer = doc->lexer; */
1357 Bool clean = cfgBool( doc, TidyMakeClean );
1358 Node* body = TY_(FindBody)( doc );
1359 Bool warn = yes; /* should this be a warning, error, or report??? */
1360 #if !defined(NDEBUG) && defined(_MSC_VER)
1361 // list_not_html5();
1365 if ( nodeHasAlignAttr( node ) ) {
1367 * Is this for ALL elements that accept an 'align' attribute, or should
1368 * this be a sub-set test
1370 TY_(ReportWarning)(doc, node, node, BAD_ALIGN_HTML5);
1372 if ( node == body ) {
1373 if ( BadBody5(body) ) {
1374 /* perhaps need a new/different warning for this, like
1375 * The background 'attribute" on the body element is obsolete. Use CSS instead.
1376 * but how to pass an attribute name to be embedded in the message.
1378 TY_(ReportWarning)(doc, node, body, BAD_BODY_HTML5);
1381 if ( nodeIsACRONYM(node) ) {
1383 /* replace with 'abbr' with warning to that effect
1384 * maybe should use static void RenameElem( TidyDocImpl* doc, Node* node, TidyTagId tid )
1386 TY_(CoerceNode)(doc, node, TidyTag_ABBR, warn, no);
1388 /* sadly, this stops writing of the tidied document, unless 'forced'
1389 TY_(ReportError)(doc, node, node, REMOVED_HTML5);
1390 so go back to a 'warning' for now...
1392 TY_(ReportWarning)(doc, node, node, REMOVED_HTML5);
1395 if ( nodeIsAPPLET(node) ) {
1397 /* replace with 'object' with warning to that effect
1398 * maybe should use static void RenameElem( TidyDocImpl* doc, Node* node, TidyTagId tid )
1400 TY_(CoerceNode)(doc, node, TidyTag_OBJECT, warn, no);
1402 TY_(ReportWarning)(doc, node, node, REMOVED_HTML5);
1405 if ( nodeIsBASEFONT(node) ) {
1407 * basefont: CSS equivalen 'font-size', 'font-family' and 'color' on body or class on each subsequent element
1408 * Difficult - If it is the first body element, then could consider adding that
1409 * to the <body> as a whole, else could perhaps apply it to all subsequent element.
1410 * But also in consideration is the fact that it was NOT supported in many browsers
1411 * For now just report a warning
1413 TY_(ReportWarning)(doc, node, node, REMOVED_HTML5);
1415 if ( nodeIsBIG(node) ) {
1417 * big: CSS equivalent 'font-size:larger'
1418 * so could replace the <big> ... </big> with
1419 * <span style="font-size: larger"> ... </span>
1420 * then replace <big> with <span>
1421 * Need to think about that...
1423 * TY_(AddStyleProperty)( doc, node, "font-size: larger" );
1424 * TY_(CoerceNode)(doc, node, TidyTag_SPAN, no, no);
1425 * Alternatively generated a <style> but how to get the style name
1426 * TY_(AddAttribute)( doc, node, "class", "????" );
1427 * Also maybe need a specific message like
1428 * Element '%s' replaced with 'span' with a 'font-size: larger style attribute
1429 * maybe should use static void RenameElem( TidyDocImpl* doc, Node* node, TidyTagId tid )
1433 TY_(AddStyleProperty)( doc, node, "font-size: larger" );
1434 TY_(CoerceNode)(doc, node, TidyTag_SPAN, warn, no);
1436 TY_(ReportWarning)(doc, node, node, REMOVED_HTML5);
1439 if ( nodeIsCENTER(node) ) {
1441 * center: CSS equivalent 'text-align:center'
1442 * and 'margin-left:auto; margin-right:auto' on descendant blocks
1443 * Tidy already handles this if 'clean' by SILENTLY generating the <style>
1444 * and adding a <div class="c1"> around the elements.
1445 * see: static Bool Center2Div( TidyDocImpl* doc, Node *node, Node **pnode)
1447 TY_(ReportWarning)(doc, node, node, REMOVED_HTML5);
1449 if ( nodeIsDIR(node) ) {
1451 * dir: replace by <ul>
1452 * Tidy already actions this and issues a warning
1453 * Should this be CHANGED???
1455 TY_(ReportWarning)(doc, node, node, REMOVED_HTML5);
1457 if ( nodeIsFONT(node) ) {
1459 * Tidy already handles this -
1460 * If 'clean' replaced by CSS, else
1461 * if is NOT clean, and doctype html5 then warnings issued
1462 * done in Bool Font2Span( TidyDocImpl* doc, Node *node, Node **pnode ) (I think?)
1465 TY_(ReportWarning)(doc, node, node, REMOVED_HTML5);
1467 if (( nodesIsFRAME(node) ) || ( nodeIsFRAMESET(node) ) || ( nodeIsNOFRAMES(node) )) {
1469 * YOW: What to do here?????? Maybe <iframe>????
1471 TY_(ReportWarning)(doc, node, node, REMOVED_HTML5);
1473 if ( nodeIsSTRIKE(node) ) {
1475 * strike: CSS equivalent 'text-decoration:line-through'
1476 * maybe should use static void RenameElem( TidyDocImpl* doc, Node* node, TidyTagId tid )
1479 TY_(AddStyleProperty)( doc, node, "text-decoration: line-through" );
1480 TY_(CoerceNode)(doc, node, TidyTag_SPAN, warn, no);
1482 TY_(ReportWarning)(doc, node, node, REMOVED_HTML5);
1485 if ( nodeIsTT(node) ) {
1487 * tt: CSS equivalent 'font-family:monospace'
1488 * Tidy presently does nothing. Tidy5 issues a warning
1489 * But like the 'clean' <font> replacement this could also be replaced with CSS
1490 * maybe should use static void RenameElem( TidyDocImpl* doc, Node* node, TidyTagId tid )
1494 TY_(AddStyleProperty)( doc, node, "font-family: monospace" );
1495 TY_(CoerceNode)(doc, node, TidyTag_SPAN, warn, no);
1497 TY_(ReportWarning)(doc, node, node, REMOVED_HTML5);
1500 if (TY_(nodeIsElement)(node)) {
1502 if ((!(node->tag->versions & VERS_HTML5))||(inRemovedInfo(node->tag->id))) {
1503 /* issue warning for elements like 'marquee' */
1504 TY_(ReportWarning)(doc, node, node, REMOVED_HTML5);
1510 TY_(CheckHTML5)( doc, node->content );
1516 ######################################################################################
1519 #if !defined(NDEBUG) && defined(_MSC_VER)
1520 /* *** FOR DEBUG ONLY *** */
/* Debug helper: maps the lexer node type held in node->type to a short,
 * human-readable label ("Root", "DocType", ..., "XmlDecl").
 *
 * vp is an untyped pointer that is cast to Node* and dereferenced without
 * any check, so the caller must pass a valid, non-NULL Node.
 * NOTE(review): the value returned for a type not listed below is not
 * visible in this view - confirm before relying on it.
 */
1521 const char *dbg_get_lexer_type( void *vp )
1523 Node *node = (Node *)vp;
1524 switch ( node->type )
1526 case RootNode: return "Root";
1527 case DocTypeTag: return "DocType";
1528 case CommentTag: return "Comment";
1529 case ProcInsTag: return "ProcIns";
1530 case TextNode: return "Text";
1531 case StartTag: return "StartTag";
1532 case EndTag: return "EndTag";
1533 case StartEndTag: return "StartEnd";
1534 case CDATATag: return "CDATA";
1535 case SectionTag: return "Section";
1536 case AspTag: return "Asp";
1537 case JsteTag: return "Jste";
1538 case PhpTag: return "Php";
1539 case XmlDecl: return "XmlDecl";
1544 /* NOTE: This matches the above lexer type, except when element has a name */
/* Debug helper: returns a printable name for a node.
 *
 * Non-element node types map to a fixed label.  For the element cases
 * listed last (TidyNode_Start, TidyNode_StartEnd) the code returns the
 * node's own element name, node->element.
 * vp is cast to Node* and dereferenced unchecked; it must be non-NULL.
 * NOTE(review): the fallback used when node->element is not available is
 * not visible in this view - confirm.
 */
1545 const char *dbg_get_element_name( void *vp )
1547 Node *node = (Node *)vp;
1548 switch ( node->type )
1550 case TidyNode_Root: return "Root";
1551 case TidyNode_DocType: return "DocType";
1552 case TidyNode_Comment: return "Comment";
1553 case TidyNode_ProcIns: return "ProcIns";
1554 case TidyNode_Text: return "Text";
1555 case TidyNode_CDATA: return "CDATA";
1556 case TidyNode_Section: return "Section";
1557 case TidyNode_Asp: return "Asp";
1558 case TidyNode_Jste: return "Jste";
1559 case TidyNode_Php: return "Php";
1560 case TidyNode_XmlDecl: return "XmlDecl";
1562 case TidyNode_Start:
1564 case TidyNode_StartEnd:
1567 return node->element;
/* Debug helper: prints a one-line description of a single node via SPRTF.
 *
 * The line carries the lexer type label, the element name, the word
 * "implicit" when node->implicit is set, and a label derived from
 * `caller` (1 = "discard", 2 = "trim", 3 = "test").  The type label is
 * omitted when it is identical to the name.  The node's attributes are
 * then walked and each value is printed as ="value".
 *
 * NOTE(review): how `indent` and `doc` are used is not visible in this
 * view; presumably `indent` controls leading padding - confirm.
 */
1572 void dbg_show_node( TidyDocImpl* doc, Node *node, int caller, int indent )
1576 ctmbstr name = dbg_get_element_name(node);
1577 ctmbstr type = dbg_get_lexer_type(node);
1578 ctmbstr impl = node->implicit ? "implicit" : "";
1581 case 1: call = "discard"; break;
1582 case 2: call = "trim"; break;
1583 case 3: call = "test"; break;
/* strcmp() != 0: type and name differ, so show both */
1587 if (strcmp(type,name))
1588 SPRTF("%s %s %s %s", type, name, impl, call );
1590 SPRTF("%s %s %s", name, impl, call );
/* `name` is reused below to hold each attribute's name */
1591 for (av = node->attributes; av; av = av->next) {
1592 name = av->attribute;
1596 SPRTF("=\"%s\"", av->value);
1603 void dbg_show_all_nodes( TidyDocImpl* doc, Node *node, int indent )
1607 dbg_show_node( doc, node, 0, indent );
1608 dbg_show_all_nodes( doc, node->content, indent + 1 );
/* Post-parse clean-up and repair pass over the tree rooted at doc->root.
 *
 * Reads its switches from the document configuration, applies the enabled
 * tree transformations (nested emphasis, list/blockquote conversion,
 * Word 2000 and Google Docs clean-up, presentational-markup clean-up),
 * reconciles the http-equiv meta element with the output encoding,
 * verifies tree integrity, records the given doctype's PUBLIC identifier
 * in doc->givenDoctype, and finally fixes doctype, anchors, XHTML
 * namespace and language information.
 *
 * Returns the value of tidyDocStatus( doc ).
 * The order of the steps matters; do not reorder them casually.
 */
1615 int tidyDocCleanAndRepair( TidyDocImpl* doc )
1617 Bool word2K = cfgBool( doc, TidyWord2000 );
1618 Bool logical = cfgBool( doc, TidyLogicalEmphasis );
1619 Bool clean = cfgBool( doc, TidyMakeClean );
1620 Bool gdoc = cfgBool( doc, TidyGDocClean );
1621 Bool dropFont = cfgBool( doc, TidyDropFontTags );
1622 Bool htmlOut = cfgBool( doc, TidyHtmlOut );
1623 Bool xmlOut = cfgBool( doc, TidyXmlOut );
1624 Bool xhtmlOut = cfgBool( doc, TidyXhtmlOut );
1625 Bool xmlDecl = cfgBool( doc, TidyXmlDecl );
1626 Bool tidyMark = cfgBool( doc, TidyMark );
1627 Bool tidyXmlTags = cfgBool( doc, TidyXmlTags );
1628 Bool wantNameAttr = cfgBool( doc, TidyAnchorAsName );
1629 Bool mergeEmphasis = cfgBool( doc, TidyMergeEmphasis );
1630 ctmbstr sdef = NULL;
1633 #if !defined(NDEBUG) && defined(_MSC_VER)
1634 SPRTF("All nodes BEFORE clean and repair\n");
1635 dbg_show_all_nodes( doc, &doc->root, 0 );
/* NOTE(review): early exit; the guarding condition is not visible in this
   view (presumably the XML-tags mode read above) - confirm. */
1638 return tidyDocStatus( doc );
1640 /* simplifies <b><b> ... </b> ...</b> etc. */
1641 if ( mergeEmphasis )
1642 TY_(NestedEmphasis)( doc, &doc->root );
1644 /* cleans up <dir>indented text</dir> etc. */
1645 TY_(List2BQ)( doc, &doc->root );
1646 TY_(BQ2Div)( doc, &doc->root );
1648 /* replaces i by em and b by strong */
1650 TY_(EmFromI)( doc, &doc->root );
1652 if ( word2K && TY_(IsWord2000)(doc) )
1654 /* prune Word2000's <![if ...]> ... <![endif]> */
1655 TY_(DropSections)( doc, &doc->root );
1657 /* drop style & class attributes and empty p, span elements */
1658 TY_(CleanWord2000)( doc, &doc->root );
1659 TY_(DropEmptyElements)(doc, &doc->root);
1662 /* replaces presentational markup by style rules */
1663 if ( clean || dropFont )
1664 TY_(CleanDocument)( doc );
1666 /* clean up html exported by Google Docs */
1668 TY_(CleanGoogleDocument)( doc );
1670 /* Move terminating <br /> tags from out of paragraphs */
1671 /*! Do we want to do this for all block-level elements? */
1673 /* This is disabled due to http://tidy.sf.net/bug/681116 */
1675 FixBrakes( doc, TY_(FindBody)( doc ));
1678 /* Reconcile http-equiv meta element with output encoding */
1679 if (cfg( doc, TidyOutCharEncoding) != RAW
1680 #ifndef NO_NATIVE_ISO2022_SUPPORT
1681 && cfg( doc, TidyOutCharEncoding) != ISO2022
1684 TY_(VerifyHTTPEquiv)( doc, TY_(FindHEAD)( doc ));
1686 if ( !TY_(CheckNodeIntegrity)( &doc->root ) )
1687 TidyPanic( doc->allocator, integrity );
1689 /* remember given doctype for reporting */
1690 node = TY_(FindDocType)(doc);
1691 sdef = tidyOptGetValue((TidyDoc)doc, TidyDoctype );
1693 sdef = tidyOptGetCurrPick((TidyDoc) doc, TidyDoctypeMode );
/* the HTML5 checks run only when the doctype setting is the string "html5" */
1694 if (sdef && (strcmp(sdef,"html5") == 0)) {
1695 TY_(CheckHTML5)( doc, &doc->root );
1699 AttVal* fpi = TY_(GetAttrByName)(node, "PUBLIC");
1700 if (AttrHasValue(fpi))
/* drop any previously remembered doctype before storing a fresh copy */
1702 if (doc->givenDoctype)
1703 TidyDocFree(doc, doc->givenDoctype);
1704 doc->givenDoctype = TY_(tmbstrdup)(doc->allocator,fpi->value);
1708 if ( doc->root.content )
1710 /* If we had XHTML input but want HTML output */
1711 if ( htmlOut && doc->lexer->isvoyager )
1713 Node* node = TY_(FindDocType)(doc);
1714 /* Remove reference, but do not free */
1716 TY_(RemoveNode)(node);
/* The two call groups below differ in the doctype routine used
   (SetXHTMLDocType vs FixDocType) and in the yes/no flag passed to
   FixXhtmlNamespace and FixLanguageInformation. */
1719 if (xhtmlOut && !htmlOut)
1721 TY_(SetXHTMLDocType)(doc);
1722 TY_(FixAnchors)(doc, &doc->root, wantNameAttr, yes);
1723 TY_(FixXhtmlNamespace)(doc, yes);
1724 TY_(FixLanguageInformation)(doc, &doc->root, yes, yes);
1728 TY_(FixDocType)(doc);
1729 TY_(FixAnchors)(doc, &doc->root, wantNameAttr, yes);
1730 TY_(FixXhtmlNamespace)(doc, no);
1731 TY_(FixLanguageInformation)(doc, &doc->root, no, yes);
1735 TY_(AddGenerator)(doc);
1738 /* ensure presence of initial <?xml version="1.0"?> */
1739 if ( xmlOut && xmlDecl )
1740 TY_(FixXmlDecl)( doc );
1742 #if !defined(NDEBUG) && defined(_MSC_VER)
1743 SPRTF("All nodes AFTER clean and repair\n");
1744 dbg_show_all_nodes( doc, &doc->root, 0 );
1746 return tidyDocStatus( doc );
/* Decides whether only the body content should be printed.
 *
 * The visible logic looks up the document's body element and tests whether
 * it exists and is marked implicit (i.e. was not present in the input).
 * NOTE(review): how the individual values of `bodyOnly` are handled, and
 * the function's return statements, are not visible in this view;
 * presumably the body lookup applies only to the "auto" state - confirm.
 */
1750 Bool showBodyOnly( TidyDocImpl* doc, TidyTriState bodyOnly )
1761 node = TY_(FindBody)( doc );
1762 if (node && node->implicit )
/* Writes the (already parsed and repaired) document to an output stream.
 *
 * Steps visible in the code, in order:
 *   - read the output-related configuration switches;
 *   - run the pre-output tree passes (CDATA conversion, comment and font
 *     element removal, typography downgrade, space normalisation or
 *     preformatted-space replacement, attribute sorting);
 *   - if markup output is enabled and either no errors were recorded or
 *     output is forced: optionally handle the byte order mark, then print
 *     with the XML tree printer (XML output without XHTML output), the
 *     body-only printer (when showBodyOnly() says so) or the regular tree
 *     printer, and flush the last line;
 *   - restore the configuration from its snapshot.
 *
 * Returns the value of tidyDocStatus( doc ).
 * NOTE(review): several of the guards around the pre-output passes, and
 * the point where `out` is attached to the document, are not visible in
 * this view - confirm against the full source.
 */
1769 int tidyDocSaveStream( TidyDocImpl* doc, StreamOut* out )
1771 Bool showMarkup = cfgBool( doc, TidyShowMarkup );
1772 Bool forceOutput = cfgBool( doc, TidyForceOutput );
1773 #if SUPPORT_UTF16_ENCODINGS
1774 Bool outputBOM = ( cfgAutoBool(doc, TidyOutputBOM) == TidyYesState );
1775 Bool smartBOM = ( cfgAutoBool(doc, TidyOutputBOM) == TidyAutoState );
1777 Bool xmlOut = cfgBool( doc, TidyXmlOut );
1778 Bool xhtmlOut = cfgBool( doc, TidyXhtmlOut );
1779 TidyTriState bodyOnly = cfgAutoBool( doc, TidyBodyOnly );
1781 Bool dropComments = cfgBool(doc, TidyHideComments);
1782 Bool makeClean = cfgBool(doc, TidyMakeClean);
1783 Bool asciiChars = cfgBool(doc, TidyAsciiChars);
1784 Bool makeBare = cfgBool(doc, TidyMakeBare);
1785 Bool escapeCDATA = cfgBool(doc, TidyEscapeCdata);
1786 TidyAttrSortStrategy sortAttrStrat = cfg(doc, TidySortAttributes);
1789 TY_(ConvertCDATANodes)(doc, &doc->root);
1792 TY_(DropComments)(doc, &doc->root);
1797 TY_(DropFontElements)(doc, &doc->root, NULL);
1800 if ((makeClean && asciiChars) || makeBare)
1801 TY_(DowngradeTypography)(doc, &doc->root);
1804 /* Note: no longer replaces in */
1805 /* attribute values / non-text tokens */
1806 TY_(NormalizeSpaces)(doc->lexer, &doc->root);
1808 TY_(ReplacePreformattedSpaces)(doc, &doc->root);
1810 if ( sortAttrStrat != TidySortAttrNone )
1811 TY_(SortAttributes)(&doc->root, sortAttrStrat);
/* markup is written only without errors, unless output is forced */
1813 if ( showMarkup && (doc->errors == 0 || forceOutput) )
1815 #if SUPPORT_UTF16_ENCODINGS
1816 /* Output a Byte Order Mark if required */
1817 if ( outputBOM || (doc->inputHadBOM && smartBOM) )
1821 /* No longer necessary. No DOCTYPE == HTML 3.2,
1822 ** which gives you only the basic character entities,
1823 ** which are safe in any browser.
1824 ** if ( !TY_(FindDocType)(doc) )
1825 ** TY_(SetOptionBool)( doc, TidyNumEntities, yes );
1829 if ( xmlOut && !xhtmlOut )
1830 TY_(PPrintXMLTree)( doc, NORMAL, 0, &doc->root );
1831 else if ( showBodyOnly( doc, bodyOnly ) )
1832 TY_(PrintBody)( doc );
1834 TY_(PPrintTree)( doc, NORMAL, 0, &doc->root );
1836 TY_(PFlushLine)( doc, 0 );
1840 TY_(ResetConfigToSnapshot)( doc );
1841 return tidyDocStatus( doc );
1844 /* Tree traversal functions
1846 ** The big issue here is the degree to which we should mimic
1847 ** DOM and/or SAX nodes.
1849 ** Is it 100% possible (and, if so, how difficult is it) to
1850 ** emit SAX events from this API? If SAX events are possible,
1851 ** is that 100% of data needed to build a DOM?
/* Returns the node handle for the root of the document tree.
 *
 * The document handle is converted to its implementation pointer and a
 * Node pointer is converted back to a public TidyNode handle.
 * NOTE(review): the statement that sets `node` is not visible in this
 * view; presumably it is the document's root node when `impl` is
 * non-NULL and NULL otherwise - confirm.
 */
1854 TidyNode TIDY_CALL tidyGetRoot( TidyDoc tdoc )
1856 TidyDocImpl* impl = tidyDocToImpl( tdoc );
1860 return tidyImplToNode( node );
1863 TidyNode TIDY_CALL tidyGetHtml( TidyDoc tdoc )
1865 TidyDocImpl* impl = tidyDocToImpl( tdoc );
1868 node = TY_(FindHTML)( impl );
1869 return tidyImplToNode( node );
1872 TidyNode TIDY_CALL tidyGetHead( TidyDoc tdoc )
1874 TidyDocImpl* impl = tidyDocToImpl( tdoc );
1877 node = TY_(FindHEAD)( impl );
1878 return tidyImplToNode( node );
1881 TidyNode TIDY_CALL tidyGetBody( TidyDoc tdoc )
1883 TidyDocImpl* impl = tidyDocToImpl( tdoc );
1886 node = TY_(FindBody)( impl );
1887 return tidyImplToNode( node );
1890 /* parent / child */
1891 TidyNode TIDY_CALL tidyGetParent( TidyNode tnod )
1893 Node* nimp = tidyNodeToImpl( tnod );
1894 return tidyImplToNode( nimp->parent );
1896 TidyNode TIDY_CALL tidyGetChild( TidyNode tnod )
1898 Node* nimp = tidyNodeToImpl( tnod );
1899 return tidyImplToNode( nimp->content );
1903 TidyNode TIDY_CALL tidyGetNext( TidyNode tnod )
1905 Node* nimp = tidyNodeToImpl( tnod );
1906 return tidyImplToNode( nimp->next );
1908 TidyNode TIDY_CALL tidyGetPrev( TidyNode tnod )
1910 Node* nimp = tidyNodeToImpl( tnod );
1911 return tidyImplToNode( nimp->prev );
1915 TidyNodeType TIDY_CALL tidyNodeGetType( TidyNode tnod )
1917 Node* nimp = tidyNodeToImpl( tnod );
1918 TidyNodeType ntyp = TidyNode_Root;
1920 ntyp = (TidyNodeType) nimp->type;
/* Public accessor returning an unsigned value for the node behind tnod.
 * NOTE(review): the body past the handle conversion is not visible in this
 * view; going by the name it presumably returns the node's source line
 * number - confirm, including what is returned for a NULL handle.
 */
1924 uint TIDY_CALL tidyNodeLine( TidyNode tnod )
1926 Node* nimp = tidyNodeToImpl( tnod );
/* Public accessor returning an unsigned value for the node behind tnod.
 * NOTE(review): the body past the handle conversion is not visible in this
 * view; going by the name it presumably returns the node's source column
 * number - confirm, including what is returned for a NULL handle.
 */
1932 uint TIDY_CALL tidyNodeColumn( TidyNode tnod )
1934 Node* nimp = tidyNodeToImpl( tnod );
1941 ctmbstr TIDY_CALL tidyNodeGetName( TidyNode tnod )
1943 Node* nimp = tidyNodeToImpl( tnod );
1944 ctmbstr nnam = NULL;
1946 nnam = nimp->element;
1951 Bool TIDY_CALL tidyNodeHasText( TidyDoc tdoc, TidyNode tnod )
1953 TidyDocImpl* doc = tidyDocToImpl( tdoc );
1955 return TY_(nodeHasText)( doc, tidyNodeToImpl(tnod) );
/* Pretty-prints the subtree rooted at tnod into outbuf.
 *
 * Nothing is printed unless tdoc, tnod and outbuf are all non-NULL.
 * A temporary StreamOut over outbuf is created with the document's output
 * character encoding and newline settings.  The XML tree printer is used
 * when XML output is enabled without XHTML output, otherwise the regular
 * tree printer; the last line is flushed and the temporary stream freed.
 *
 * NOTE(review): the return statements and the point where the temporary
 * stream is attached to / detached from the document are not visible in
 * this view; presumably yes is returned on success and no otherwise -
 * confirm.
 */
1960 Bool TIDY_CALL tidyNodeGetText( TidyDoc tdoc, TidyNode tnod, TidyBuffer* outbuf )
1962 TidyDocImpl* doc = tidyDocToImpl( tdoc );
1963 Node* nimp = tidyNodeToImpl( tnod );
1964 if ( doc && nimp && outbuf )
1966 uint outenc = cfg( doc, TidyOutCharEncoding );
1967 uint nl = cfg( doc, TidyNewline );
1968 StreamOut* out = TY_(BufferOutput)( doc, outbuf, outenc, nl );
1969 Bool xmlOut = cfgBool( doc, TidyXmlOut );
1970 Bool xhtmlOut = cfgBool( doc, TidyXhtmlOut );
1973 if ( xmlOut && !xhtmlOut )
1974 TY_(PPrintXMLTree)( doc, NORMAL, 0, nimp );
1976 TY_(PPrintTree)( doc, NORMAL, 0, nimp );
1978 TY_(PFlushLine)( doc, 0 );
/* the StreamOut wrapper was created above and is owned by this function */
1981 TidyDocFree( doc, out );
/* Copies the raw value of a node into buf.
 *
 * All three arguments are checked for NULL first.  For the node types that
 * carry a value, buf is cleared and then filled with the bytes of the
 * lexer buffer between node->start and node->end; other node types have
 * no value.
 *
 * NOTE(review): the case labels and the return statements are not visible
 * in this view; presumably no is returned for NULL arguments and for
 * nodes without a value, and yes otherwise - confirm.
 */
1987 Bool TIDY_CALL tidyNodeGetValue( TidyDoc tdoc, TidyNode tnod, TidyBuffer* buf )
1989 TidyDocImpl *doc = tidyDocToImpl( tdoc );
1990 Node *node = tidyNodeToImpl( tnod );
1991 if ( doc == NULL || node == NULL || buf == NULL )
1994 switch( node->type ) {
/* replace, rather than extend, whatever the caller's buffer held */
2004 tidyBufClear( buf );
2005 tidyBufAppend( buf, doc->lexer->lexbuf + node->start,
2006 node->end - node->start );
2010 /* The node doesn't have a value */
/* Reports whether a node is proprietary (not part of a standard HTML
 * version).
 *
 * The result starts out as yes and is refined according to the node type.
 * For nodes that have a tag definition the answer is whether the
 * VERS_PROPRIETARY bit is set in tag->versions.  The tdoc argument is
 * unused.
 *
 * NOTE(review): the case labels, and the value chosen when a node has no
 * tag definition, are not visible in this view - confirm.
 */
2017 Bool TIDY_CALL tidyNodeIsProp( TidyDoc ARG_UNUSED(tdoc), TidyNode tnod )
2019 Node* nimp = tidyNodeToImpl( tnod );
2020 Bool isProprietary = yes;
2023 switch ( nimp->type )
2039 isProprietary = yes;
2045 isProprietary = ( nimp->tag
2046 ? (nimp->tag->versions&VERS_PROPRIETARY)!=0
2051 return isProprietary;
2054 TidyTagId TIDY_CALL tidyNodeGetId(TidyNode tnod)
2056 Node* nimp = tidyNodeToImpl(tnod);
2058 TidyTagId tagId = TidyTag_UNKNOWN;
2059 if (nimp && nimp->tag)
2060 tagId = nimp->tag->id;
2066 /* Null for non-element nodes and all pure HTML
2067 cmbstr tidyNodeNsLocal( TidyNode tnod )
2070 cmbstr tidyNodeNsPrefix( TidyNode tnod )
2073 cmbstr tidyNodeNsUri( TidyNode tnod )
2078 /* Iterate over attribute values */
2079 TidyAttr TIDY_CALL tidyAttrFirst( TidyNode tnod )
2081 Node* nimp = tidyNodeToImpl( tnod );
2082 AttVal* attval = NULL;
2084 attval = nimp->attributes;
2085 return tidyImplToAttr( attval );
2087 TidyAttr TIDY_CALL tidyAttrNext( TidyAttr tattr )
2089 AttVal* attval = tidyAttrToImpl( tattr );
2090 AttVal* nxtval = NULL;
2092 nxtval = attval->next;
2093 return tidyImplToAttr( nxtval );
2096 ctmbstr TIDY_CALL tidyAttrName( TidyAttr tattr )
2098 AttVal* attval = tidyAttrToImpl( tattr );
2099 ctmbstr anam = NULL;
2101 anam = attval->attribute;
2104 ctmbstr TIDY_CALL tidyAttrValue( TidyAttr tattr )
2106 AttVal* attval = tidyAttrToImpl( tattr );
2107 ctmbstr aval = NULL;
2109 aval = attval->value;
2113 /* Null for pure HTML
2114 ctmbstr tidyAttrNsLocal( TidyAttr tattr )
2117 ctmbstr tidyAttrNsPrefix( TidyAttr tattr )
2120 ctmbstr tidyAttrNsUri( TidyAttr tattr )
2125 TidyAttrId TIDY_CALL tidyAttrGetId( TidyAttr tattr )
2127 AttVal* attval = tidyAttrToImpl( tattr );
2128 TidyAttrId attrId = TidyAttr_UNKNOWN;
2129 if ( attval && attval->dict )
2130 attrId = attval->dict->id;
/* Obsolete query kept for interface compatibility; per the note inside the
 * body it no longer works and must not be used.
 * NOTE(review): the return statement is not visible in this view;
 * presumably a constant - confirm.
 */
2133 Bool TIDY_CALL tidyAttrIsProp( TidyAttr tattr )
2136 You cannot tell whether an attribute is proprietary without
2137 knowing on which element it occurs in the general case, but
2138 this function cannot know the element. As a result, it does
2139 not work anymore. Do not use.
2147 * indent-tabs-mode: nil
2149 * eval: (c-set-offset 'substatement-open 0)