GPL_bin_fullset/NaturalDocs/Modules/NaturalDocs/LineReader.pm

   1 ###############################################################################
   2 #
   3 #   Class: NaturalDocs::LineReader
   4 #
   5 ###############################################################################
   6 #
   7 #   An object to handle reading text files line by line in a cross platform manner.  Using this class instead of the standard
   8 #       angle brackets approach has the following benefits:
   9 #
  10 #       - It strips all three types of line breaks automatically: CR/LF (Windows) LF (Unix) and CR (Classic Mac).  You do not need to
  11 #         call chomp().  Perl's chomp() fails when parsing Windows-format line breaks on a Unix platform anyway.  It leaves the \r on,
  12 #         which screws everything up.
  13 #       - It reads Classic Mac files line by line correctly, whereas the Perl version returns it all as one line.
  14 #       - It abstracts away ignoring the Unicode BOM on the first line, if present.
  15 #
  16 ###############################################################################
  17
  18 # This file is part of Natural Docs, which is Copyright © 2003-2010 Greg Valure
  19 # Natural Docs is licensed under version 3 of the GNU Affero General Public License (AGPL)
  20 # Refer to License.txt for the complete details
  21
  22 use strict;
  23 use integer;
  24
  25 use Encode;
  26
  27
  28 package NaturalDocs::LineReader;
  29
  30 #
  31 #       Constants: Members
  32 #
  33 #       LINEREADER_FILEHANDLE - The file handle being used to read the file.  Has the LINEREADER_ prefix to make sure it doesn't
  34 #                                                                                        conflict with any actual filehandles named FILEHANDLE in the program.
  35 #       CACHED_LINES - An arrayref of lines already read into memory.
  36 #
  37 use NaturalDocs::DefineMembers 'LINEREADER_FILEHANDLE',
  38                                                  'CACHED_LINES';
  39
  40 #
  41 #   Function: New
  42 #
  43 #   Creates and returns a new object.
  44 #
  45 #   Parameters:
  46 #
  47 #       filehandle - The file handle being used to read the file.
  48 #
  49 sub New #(filehandle)
  50     {
  51     my ($selfPackage, $filehandle) = @_;
  52
  53     my $object = [ ];
  54
  55     $object->[LINEREADER_FILEHANDLE] = $filehandle;
  56     $object->[CACHED_LINES] = [ ];
  57
  58     binmode($filehandle, ':raw');
  59
  60         my $hasBOM = 0;
  61     my $possibleBOM = undef;
  62     read($filehandle, $possibleBOM, 2);
  63
  64     if ($possibleBOM eq "\xEF\xBB")
  65         {
  66         read($filehandle, $possibleBOM, 1);
  67         if ($possibleBOM eq "\xBF")
  68             {
  69             binmode($filehandle, ':crlf:encoding(UTF-8)');  # Strict UTF-8, not Perl's lax version.
  70                         $hasBOM = 1;
  71             }
  72         }
  73     elsif ($possibleBOM eq "\xFE\xFF")
  74         {
  75         binmode($filehandle, ':crlf:encoding(UTF-16BE)');
  76                 $hasBOM = 1;
  77         }
  78     elsif ($possibleBOM eq "\xFF\xFE")
  79         {
  80         binmode($filehandle, ':crlf:encoding(UTF-16LE)');
  81                 $hasBOM = 1;
  82         }
  83
  84         if (!$hasBOM)
  85         {
  86         seek($filehandle, 0, 0);
  87
  88                 my $rawData = undef;
  89                 my $readLength = -s $filehandle;
  90
  91                 # Since we're only reading the data to determine if it's UTF-8, sanity check the file length.  We may run
  92                 # across a huge extensionless system file and we don't want to load the whole thing.  Half a meg should
  93                 # be good enough to encompass giant source files while not bogging things down on system files.
  94                 if ($readLength > 512 * 1024)
  95                         {  $readLength = 512 * 1024;  }
  96
  97                 read($filehandle, $rawData, $readLength);
  98
  99                 eval
 100                         {  $rawData = Encode::decode("UTF-8", $rawData, Encode::FB_CROAK);  };
 101
 102                 if ($::EVAL_ERROR)
 103                         {  binmode($filehandle, ':crlf');  }
 104                 else
 105                         {
 106                         # Theoretically, since this is valid UTF-8 data we should be able to split it on line breaks and feed them into
 107                         # CACHED_LINES instead of setting the encoding to UTF-8 and seeking back to zero just to read it all again.
 108                         # Alas, this doesn't work for an easily identifiable reason.  I'm sure there is one, but I couldn't figure it out
 109                         # before my patience ran out so I'm just letting the file cache absorb the hit instead.  If we were ever to do
 110                         # this in the future you'd have to handle the file length capping code above too.
 111                         binmode($filehandle, ':crlf:encoding(UTF-8)');
 112                         }
 113
 114                 seek($filehandle, 0, 0);
 115                 }
 116
 117     bless $object, $selfPackage;
 118     return $object;
 119     };
 120
 121
 122 #
 123 #   Function: Chomp
 124 #
 125 #   Removes any line breaks from the end of a value.  It does not remove any that are in the middle of it.
 126 #
 127 #   Parameters:
 128 #
 129 #       lineRef - A *reference* to the line to chomp.
 130 #
 131 sub Chomp #(lineRef)
 132     {
 133     my ($self, $lineRef) = @_;
 134     $$lineRef =~ s/(?:\r\n|\r|\n)$//;
 135     };
 136
 137
 138 #
 139 #       Function: Get
 140 #
 141 #       Returns the next line of text from the file, or undef if there are no more.  The line break will be removed automatically.  If
 142 #       the first line contains a Unicode BOM, that will also be removed automatically.
 143 #
 144 sub Get
 145         {
 146         my $self = shift;
 147         my $line = undef;
 148
 149         if (scalar @{$self->[CACHED_LINES]} == 0)
 150                 {
 151                 my $filehandle = $self->[LINEREADER_FILEHANDLE];
 152                 my $rawLine = <$filehandle>;
 153
 154                 if (!defined $rawLine)
 155                         {  return undef;  }
 156
 157                 $self->Chomp(\$rawLine);
 158
 159         if ($rawLine =~ /\r/)
 160                 {
 161                         push @{$self->[CACHED_LINES]}, split(/\r/, $rawLine);  # Split for Classic Mac
 162                         $line = shift @{$self->[CACHED_LINES]};
 163                 }
 164         else
 165                 {  $line = $rawLine;  }
 166                 }
 167         else
 168                 {  $line = shift @{$self->[CACHED_LINES]};  }
 169
 170         return $line;
 171         }
 172
 173
 174 #
 175 #       Function: GetAll
 176 #
 177 #       Returns an array of all the lines from the file.  The line breaks will be removed automatically.  If the first line contains a
 178 #       Unicode BOM, that will also be removed automatically.
 179 #
 180 sub GetAll
 181         {
 182         my $self = shift;
 183
 184         my $filehandle = $self->[LINEREADER_FILEHANDLE];
 185         my $rawContent;
 186
 187     read($filehandle, $rawContent, -s $filehandle);
 188
 189     return split(/\r\n|\n|\r/, $rawContent);
 190         }
 191
 192 1;