# The Tokens class represents a list of tokens returned from a Scanner.
8 # A token is not a special object, just a two-element Array
10 # * the _token_ _kind_ (a Symbol representing the type of the token)
11 # * the _token_ _text_ (the original source of the token in a String)
13 # A token looks like this:
15 # [:comment, '# It looks like this']
16 # [:float, '3.1415926']
19 # Some scanners also yield some kind of sub-tokens, represented by special
20 # token texts, namely :open and :close .
22 # The Ruby scanner, for example, splits "a string" into:
27 # [:content, 'a string'],
32 # Tokens is also the interface between Scanners and Encoders:
33 # The input is split and saved into a Tokens object. The Encoder
34 # then builds the output from this object.
36 # Thus, the syntax below becomes clear:
38 # CodeRay.scan('price = 2.59', :ruby).html
39 # # the Tokens object is here -------^
41 # See how small it is? ;)
43 # Tokens gives you the power to handle pre-scanned code very easily:
44 # You can convert it to a webpage, a YAML file, or dump it into a gzip'ed string
45 # that you put in your DB.
47 # Tokens' subclass TokenStream allows streaming to save memory.
# Convert the token to a string.
#
# This format is used by Encoders::Tokens.
# It can be reverted using read_token.
#
# Text tokens become "kind\ttext\n"; special tokens (whose text is a
# Symbol like :open/:close) are written with a leading colon.
def write_token text, type
  if text.is_a? ::String
    "#{type}\t#{escape(text)}\n"
  else
    ":#{text}\t#{type}\t\n"
  end
end
# Read a token from the string.
#
# Inversion of write_token.
#
# NOTE(review): for special tokens the two fields come back swapped
# relative to write_token's arguments, and trailing "\t\n" is not
# stripped here — confirm that callers chomp each line first.
def read_token token
  type, text = token.split("\t", 2)
  if type.start_with? ':'
    # Special token, e.g. ":open\tstring"
    [text.to_sym, type[1..-1].to_sym]
  else
    [type.to_sym, unescape(text)]
  end
end
# Escapes a string for use in write_token.
#
# Prefixes every newline and backslash with a backslash so the
# one-token-per-line format stays parseable.
def escape text
  text.gsub(/[\n\\]/, '\\\\\&')
end
# Unescapes a string created by escape.
#
# Drops the escaping backslash before a newline or backslash.
def unescape text
  text.gsub(/\\[\n\\]/) { |m| m[1,1] }
end
90 # Whether the object is a TokenStream.
# Iterates over all tokens.
#
# If a filter is given, only tokens of that kind are yielded.
def each kind_filter = nil, &block
  if kind_filter
    super() do |text, kind|
      next unless kind == kind_filter
      block.call text, kind
    end
  else
    # No filter: delegate straight to Array#each.
    super(&block)
  end
end
# Iterates over all text tokens.
# Range tokens like [:open, :string] are left out.
#
# Example:
#   tokens.each_text_token { |text, kind| text.replace html_escape(text) }
def each_text_token
  each do |text, kind|
    # Skip special tokens whose text is a Symbol (e.g. :open/:close).
    next unless text.is_a? ::String
    yield text, kind
  end
end
# Encode the tokens using encoder.
#
# encoder can be
# * a symbol like :html or :statistic
# * an Encoder class
# * an Encoder object
#
# options are passed to the encoder.
def encode encoder, options = {}
  unless encoder.is_a? Encoders::Encoder
    # Resolve a symbol like :html to its Encoder class; a Class is used as-is.
    encoder_class = encoder.is_a?(Class) ? encoder : Encoders[encoder]
    encoder = encoder_class.new options
  end
  encoder.encode_tokens self, options
end
# Turn into a string using Encoders::Text.
#
# +options+ are passed to the encoder if given.
def to_s options = {}
  encode :text, options
end
# Redirects unknown methods to encoder calls.
#
# For example, if you call +tokens.html+, the HTML encoder
# is used to highlight the tokens.
#
# NOTE(review): an unknown encoder name will surface as an error from
# Encoders[], not as NoMethodError — confirm this is intended.
def method_missing meth, options = {}
  Encoders[meth].new(options).encode_tokens self
end
# Returns the tokens compressed by joining consecutive
# tokens of the same kind.
#
# This can not be undone, but should yield the same output
# in most Encoders. It basically makes the output smaller.
#
# Combined with dump, it saves space for the cost of time.
#
# If the scanner is written carefully, this is not required -
# for example, consecutive //-comment lines could already be
# joined in one comment token by the Scanner.
def optimize
  print ' Tokens#optimize: before: %d - ' % size if $DEBUG
  last_kind = last_text = nil
  new = self.class.new
  each do |text, kind|
    if text.is_a? ::String
      if kind == last_kind
        # Same kind as the pending token: join the texts.
        last_text << text
      else
        new << [last_text, last_kind] if last_kind
        # dup so joining never mutates the original token's string
        last_text = text.dup
        last_kind = kind
      end
    else
      # Special token (:open/:close): flush the pending text token
      # and pass the special token through untouched.
      new << [last_text, last_kind] if last_kind
      last_kind = last_text = nil
      new << [text, kind]
    end
  end
  new << [last_text, last_kind] if last_kind
  print 'after: %d (%d saved = %2.0f%%)' %
    [new.size, size - new.size, 1.0 - (new.size.to_f / size)] if $DEBUG
  new
end
194 # Compact the object itself; see optimize.
199 # Ensure that all :open tokens have a correspondent :close one.
203 # Check token nesting using a stack of kinds.
205 for token, kind in self
208 elsif token == :close
209 expected = opened.pop
211 # Unexpected :close; decide what to do based on the kind:
212 # - token was opened earlier: also close tokens in between
213 # - token was never opened: delete the :close (skip with next)
214 next unless opened.rindex expected
215 tokens << [:close, kind] until (kind = opened.pop) == expected
218 tokens << [token, kind]
220 # Close remaining opened tokens
221 tokens << [:close, kind] while kind = opened.pop
# Splits the tokens into lines, making sure that:
# - newlines are single tokens
#   (which means all other tokens are single-line)
# - there are no open tokens at the end of the line
#
# This makes it simple for encoders that work line-oriented,
# like HTML with list-style numeration.
def split_into_lines
  raise NotImplementedError
end
# Replaces self with its line-split version; see split_into_lines.
def split_into_lines!
  replace split_into_lines
end
# Dumps the object into a String that can be saved
# in files or databases.
#
# The dump is created with Marshal.dump;
# in addition, it is gzipped using GZip.gzip.
#
# The returned String object includes Undumping
# so it has an #undump method. See Tokens.load.
#
# You can configure the level of compression,
# but the default value 7 should be what you want
# in most cases as it is a good compromise between
# speed and compression rate.
def dump gzip_level = 7
  require 'coderay/helpers/gzip_simple'  # deliberate lazy load
  dump = Marshal.dump self
  dump = dump.gzip gzip_level
  dump.extend Undumping  # extend returns the receiver, i.e. the dump string
end
# The total size of the tokens.
# Should be equal to the input size before
# scanning.
def text_size
  size = 0
  each_text_token do |t, k|
    size += t.size
  end
  size
end
# The concatenated text of all text tokens.
# Should be equal to the input before scanning.
def text
  # Special tokens map to nil, which Array#join skips.
  map { |t, k| t if t.is_a? ::String }.join
end
# Include this module to give an object an #undump
# method.
#
# The string returned by Tokens.dump includes Undumping.
module Undumping
  # Calls Tokens.load with itself.
  def undump
    Tokens.load self
  end
end
# Undump the object using Marshal.load, then
# unzip it using GZip.gunzip.
#
# The result is commonly a Tokens object, but
# this is not guaranteed.
def Tokens.load dump
  require 'coderay/helpers/gzip_simple'  # deliberate lazy load
  dump = dump.gunzip
  # NOTE(review): Marshal.load must only ever be fed trusted dumps;
  # deserializing untrusted data can execute arbitrary code.
  @dump = Marshal.load dump
end
311 # The TokenStream class is a fake Array without elements.
313 # It redirects the method << to a block given at creation.
315 # This allows scanners and Encoders to use streaming (no
316 # tokens are saved, the input is highlighted the same time it
317 # is scanned) with the same code.
319 # See CodeRay.encode_stream and CodeRay.scan_stream
320 class TokenStream < Tokens
322 # Whether the object is a TokenStream.
329 # The Array is empty, but size counts the tokens given by <<.
# Creates a new TokenStream that calls +block+ whenever
# its << method is called.
#
# Example:
#
#   token_stream = CodeRay::TokenStream.new do |kind, text|
#     puts 'kind: %s, text size: %d.' % [kind, text.size]
#   end
#
#   token_stream << [:regexp, '/\d+/']
#   #-> kind: regexp, text size: 5.
#
def initialize &block
  raise ArgumentError, 'Block expected for streaming.' unless block
  @callback = block
  @size = 0  # counts the tokens given by <<
end
352 # Calls +block+ with +token+ and increments size.
# This method is not implemented due to speed reasons. Use Tokens.
def text_size
  raise NotImplementedError,
    'This method is not implemented due to speed reasons.'
end
# A TokenStream cannot be dumped. Use Tokens.
#
# Accepts the same optional gzip_level argument as Tokens#dump so
# callers of either signature get NotImplementedError, not ArgumentError.
def dump gzip_level = 7
  raise NotImplementedError, 'A TokenStream cannot be dumped.'
end
# A TokenStream cannot be optimized. Use Tokens.
def optimize
  raise NotImplementedError, 'A TokenStream cannot be optimized.'
end
380 # Token name abbreviations
381 require 'coderay/token_classes'