1 ###############################################################################
3 # Class: NaturalDocs::LineReader
5 ###############################################################################
7 # An object to handle reading text files line by line in a cross platform manner. Using this class instead of the standard
8 # angle brackets approach has the following benefits:
10 # - It strips all three types of line breaks automatically: CR/LF (Windows), LF (Unix), and CR (Classic Mac). You do not need to
11 # call chomp(). Perl's chomp() fails when parsing Windows-format line breaks on a Unix platform anyway. It leaves the \r on,
12 # which breaks any parsing that expects the line to end where its text does.
13 # - It reads Classic Mac files line by line correctly, whereas the Perl version returns it all as one line.
14 # - It abstracts away ignoring the Unicode BOM on the first line, if present.
16 ###############################################################################
18 # This file is part of Natural Docs, which is Copyright © 2003-2010 Greg Valure
19 # Natural Docs is licensed under version 3 of the GNU Affero General Public License (AGPL)
20 # Refer to License.txt for the complete details
package NaturalDocs::LineReader;

#
#   Constants: Members
#
#   The object is a blessed arrayref.  These names are used as the indexes of its members.
#
#   LINEREADER_FILEHANDLE - The file handle being used to read the file.  Has the LINEREADER_ prefix to make sure it doesn't
#                           conflict with any actual filehandles named FILEHANDLE in the program.
#   CACHED_LINES - An arrayref of lines already read into memory.
#
# NOTE(review): the member list was cut off after its first entry in the copy this was restored from.  CACHED_LINES is the
# only other member documented and used in this file - confirm there are no further entries.
use NaturalDocs::DefineMembers 'LINEREADER_FILEHANDLE',
                               'CACHED_LINES';
#
#   Function: New
#
#   Creates and returns a new object.
#
#   The handle is put into raw mode and inspected to decide which input layers to apply:
#
#   - If the data starts with a UTF-8, UTF-16BE, or UTF-16LE byte order mark, the matching encoding layer is applied and the
#     read position is left just past the mark.
#   - Otherwise up to 512 KB of the data is test-decoded as strict UTF-8.  If that succeeds the handle is read as UTF-8, if
#     not it is read as plain bytes.  Either way the handle is rewound to the beginning.
#
#   Parameters:
#
#       filehandle - The file handle being used to read the file.  seek() is called on it, so it has to support that.
#
sub New #(filehandle)
    {
    my ($selfPackage, $filehandle) = @_;

    # Encode::decode() is needed for the UTF-8 check below.  This is a no-op when the module is already loaded.
    require Encode;

    my $object = [ ];
    $object->[LINEREADER_FILEHANDLE] = $filehandle;
    $object->[CACHED_LINES] = [ ];

    binmode($filehandle, ':raw');

    # Look for a byte order mark.  The UTF-8 mark is three bytes long, so its last byte is only read once the first two
    # have matched.
    my $bomLayers = undef;
    my $possibleBOM = '';
    read($filehandle, $possibleBOM, 2);

    if ($possibleBOM eq "\xEF\xBB")
        {
        my $thirdByte = '';
        read($filehandle, $thirdByte, 1);

        if ($thirdByte eq "\xBF")
            {  $bomLayers = ':crlf:encoding(UTF-8)';  }  # Strict UTF-8, not Perl's lax version.
        }
    elsif ($possibleBOM eq "\xFE\xFF")
        {  $bomLayers = ':crlf:encoding(UTF-16BE)';  }
    elsif ($possibleBOM eq "\xFF\xFE")
        {  $bomLayers = ':crlf:encoding(UTF-16LE)';  }

    if (defined $bomLayers)
        {
        # The read position is already just past the mark, so the mark never shows up in the first line.
        binmode($filehandle, $bomLayers);
        }
    else
        {
        seek($filehandle, 0, 0);

        # Since we're only reading the data to determine if it's UTF-8, cap how much is loaded.  We may run across a huge
        # extensionless system file and we don't want to load the whole thing.  Half a meg should be good enough to
        # encompass giant source files while not bogging things down on system files.
        my $sampleLimit = 512 * 1024;

        my $rawData = '';
        read($filehandle, $rawData, $sampleLimit);

        # A full-size sample may have been cut off in the middle of a multi-byte character, which would make the strict
        # decode below reject an otherwise valid UTF-8 file.  Drop the last multi-byte sequence, complete or not; losing
        # one character doesn't affect the verdict.
        if (length($rawData) >= $sampleLimit)
            {  $rawData =~ s/[\xC2-\xF4][\x80-\xBF]{0,3}\z//;  }

        # FB_CROAK makes decode() die on the first malformed sequence, so the eval only yields 1 for clean UTF-8.
        my $isUTF8 = eval
            {
            Encode::decode("UTF-8", $rawData, Encode::FB_CROAK());
            1;
            };

        if ($isUTF8)
            {
            # Feeding the decoded sample straight into CACHED_LINES would avoid reading the data twice, but that did not
            # work when it was tried and the cause was never identified, so the file is simply read again through the
            # encoding layer and the file cache absorbs the hit.  Anyone revisiting this also has to deal with the sample
            # size cap above.
            binmode($filehandle, ':crlf:encoding(UTF-8)');
            }
        else
            {  binmode($filehandle, ':crlf');  }

        seek($filehandle, 0, 0);
        }

    bless $object, $selfPackage;
    return $object;
    }
#
#   Function: Chomp
#
#   Removes one line break (CR/LF, CR, or LF) from the very end of a value.  It does not remove any that are in the middle
#   of it.
#
#   Parameters:
#
#       lineRef - A *reference* to the line to chomp.  The string it points to is modified in place.
#
sub Chomp #(lineRef)
    {
    my ($self, $lineRef) = @_;

    # \z only matches at the true end of the string.  $ would also match just before a final newline, which lets a line
    # break in the middle be removed instead of the one at the end, such as in "a\r\n\n".
    $$lineRef =~ s/(?:\r\n|\r|\n)\z//;
    }
#
#   Function: Get
#
#   Returns the next line of text from the file, or undef if there are no more.  The line break will be removed
#   automatically.  If the file starts with a Unicode BOM it is not part of the first line, because <New()> leaves the read
#   position past it.
#
sub Get
    {
    my $self = shift;

    my $cachedLines = $self->[CACHED_LINES];

    # Lines left over from an earlier read go out before the file is touched again.
    if (scalar @$cachedLines > 0)
        {  return shift @$cachedLines;  }

    my $filehandle = $self->[LINEREADER_FILEHANDLE];
    my $rawLine = <$filehandle>;

    if (!defined $rawLine)
        {  return undef;  }

    $self->Chomp(\$rawLine);

    if ($rawLine !~ /\r/)
        {  return $rawLine;  }

    # Split for Classic Mac.  Perl only breaks on LF, so several CR-terminated lines arrive as one.  The -1 limit keeps
    # trailing empty fields.  Without it a chunk made only of CRs splits to nothing and undef would be returned, which
    # callers read as the end of the file even though there may be more to read.
    push @$cachedLines, split(/\r/, $rawLine, -1);

    return shift @$cachedLines;
    }
#
#   Function: GetAll
#
#   Returns an array of all the remaining lines from the file.  The line breaks will be removed automatically.  If the file
#   starts with a Unicode BOM it is not part of the first line, because <New()> leaves the read position past it.
#
#   Lines that <Get()> has already read from the file but not yet returned are included first, and are removed from the
#   cache.
#
sub GetAll
    {
    my $self = shift;

    my $filehandle = $self->[LINEREADER_FILEHANDLE];

    # Get() may have read more of the file than it has handed out so far.  Those lines are no longer available from the
    # handle, so take them from the cache.
    my @lines = @{$self->[CACHED_LINES]};
    $self->[CACHED_LINES] = [ ];

    # Slurp whatever is left.  This doesn't depend on the handle having a size the way read() with -s does.
    my $rawContent = do { local $/ = undef; <$filehandle> };

    if (defined $rawContent)
        {  push @lines, split(/\r\n|\n|\r/, $rawContent);  }

    return @lines;
    }