# The Tokens class represents a list of tokens returned from a Scanner.
8 # A token is not a special object, just a two-element Array
10 # * the _token_ _kind_ (a Symbol representing the type of the token)
11 # * the _token_ _text_ (the original source of the token in a String)
13 # A token looks like this:
15 # [:comment, '# It looks like this']
16 # [:float, '3.1415926']
19 # Some scanners also yield some kind of sub-tokens, represented by special
20 # token texts, namely :open and :close .
22 # The Ruby scanner, for example, splits "a string" into:
27 # [:content, 'a string'],
32 # Tokens is also the interface between Scanners and Encoders:
33 # The input is split and saved into a Tokens object. The Encoder
34 # then builds the output from this object.
36 # Thus, the syntax below becomes clear:
38 # CodeRay.scan('price = 2.59', :ruby).html
39 # # the Tokens object is here -------^
41 # See how small it is? ;)
43 # Tokens gives you the power to handle pre-scanned code very easily:
44 # You can convert it to a webpage, a YAML file, or dump it into a gzip'ed string
45 # that you put in your DB.
47 # Tokens' subclass TokenStream allows streaming to save memory.
# Convert the token to a string.
#
# This format is used by Encoders::Tokens.
# It can be reverted using read_token.
#
# Text tokens become "kind\ttext\n"; special tokens (whose text is a
# Symbol like :open/:close) are written with a leading colon.
def write_token text, type
  if text.is_a? ::String
    "#{type}\t#{escape(text)}\n"
  else
    ":#{text}\t#{type}\t\n"
  end
end
# Read a token from the string.
#
# Inversion of write_token.
#
# NOTE(review): for special tokens the two fields come back swapped
# relative to write_token's arguments, and trailing "\t\n" is not
# stripped here — confirm that callers chomp each line first.
def read_token token
  type, text = token.split("\t", 2)
  if type.start_with? ':'
    # Special token, e.g. ":open\tstring"
    [text.to_sym, type[1..-1].to_sym]
  else
    [type.to_sym, unescape(text)]
  end
end
# Escapes a string for use in write_token.
#
# Prefixes every newline and backslash with a backslash so the
# one-token-per-line format stays parseable.
def escape text
  text.gsub(/[\n\\]/, '\\\\\&')
end
# Unescapes a string created by escape.
#
# Drops the escaping backslash before a newline or backslash.
def unescape text
  text.gsub(/\\[\n\\]/) { |m| m[1,1] }
end
90 # Whether the object is a TokenStream.
# Iterates over all tokens.
#
# If a filter is given, only tokens of that kind are yielded.
def each kind_filter = nil, &block
  if kind_filter
    super() do |text, kind|
      next unless kind == kind_filter
      block.call text, kind
    end
  else
    # No filter: delegate straight to Array#each.
    super(&block)
  end
end
# Iterates over all text tokens.
# Range tokens like [:open, :string] are left out.
#
# Example:
#   tokens.each_text_token { |text, kind| text.replace html_escape(text) }
def each_text_token
  each do |text, kind|
    # Skip special tokens whose text is a Symbol (e.g. :open/:close).
    next unless text.is_a? ::String
    yield text, kind
  end
end
# Encode the tokens using encoder.
#
# encoder can be
# * a symbol like :html or :statistic
# * an Encoder class
# * an Encoder object
#
# options are passed to the encoder.
def encode encoder, options = {}
  unless encoder.is_a? Encoders::Encoder
    # Resolve a symbol like :html to its Encoder class; a Class is used as-is.
    encoder_class = encoder.is_a?(Class) ? encoder : Encoders[encoder]
    encoder = encoder_class.new options
  end
  encoder.encode_tokens self, options
end
# Turn into a string using Encoders::Text.
#
# +options+ are passed to the encoder if given.
def to_s options = {}
  encode :text, options
end
# Redirects unknown methods to encoder calls.
#
# For example, if you call +tokens.html+, the HTML encoder
# is used to highlight the tokens.
#
# NOTE(review): an unknown encoder name will surface as an error from
# Encoders[], not as NoMethodError — confirm this is intended.
def method_missing meth, options = {}
  Encoders[meth].new(options).encode_tokens self
end
# Returns the tokens compressed by joining consecutive
# tokens of the same kind.
#
# This can not be undone, but should yield the same output
# in most Encoders. It basically makes the output smaller.
#
# Combined with dump, it saves space for the cost of time.
#
# If the scanner is written carefully, this is not required -
# for example, consecutive //-comment lines could already be
# joined in one comment token by the Scanner.
def optimize
  print ' Tokens#optimize: before: %d - ' % size if $DEBUG
  last_kind = last_text = nil
  new = self.class.new
  each do |text, kind|
    if text.is_a? ::String
      if kind == last_kind
        # Same kind as the pending token: join the texts.
        last_text << text
      else
        new << [last_text, last_kind] if last_kind
        # dup so joining never mutates the original token's string
        last_text = text.dup
        last_kind = kind
      end
    else
      # Special token (:open/:close): flush the pending text token
      # and pass the special token through untouched.
      new << [last_text, last_kind] if last_kind
      last_kind = last_text = nil
      new << [text, kind]
    end
  end
  new << [last_text, last_kind] if last_kind
  print 'after: %d (%d saved = %2.0f%%)' %
    [new.size, size - new.size, 1.0 - (new.size.to_f / size)] if $DEBUG
  new
end
194 # Compact the object itself; see optimize.
199 # Ensure that all :open tokens have a correspondent :close one.
203 # Check token nesting using a stack of kinds.
205 for token, kind in self
208 elsif token == :close
209 expected = opened.pop
211 # Unexpected :close; decide what to do based on the kind:
212 # - token was opened earlier: also close tokens in between
213 # - token was never opened: delete the :close (skip with next)
214 next unless opened.rindex expected
215 tokens << [:close, kind] until (kind = opened.pop) == expected
218 tokens << [token, kind]
220 # Close remaining opened tokens
221 tokens << [:close, kind] while kind = opened.pop
# Splits the tokens into lines, making sure that:
# - newlines are single tokens
#   (which means all other tokens are single-line)
# - there are no open tokens at the end of the line
#
# This makes it simple for encoders that work line-oriented,
# like HTML with list-style numeration.
def split_into_lines
  raise NotImplementedError
end
# Replaces self with its line-split version; see split_into_lines.
def split_into_lines!
  replace split_into_lines
end
# Dumps the object into a String that can be saved
# in files or databases.
#
# The dump is created with Marshal.dump;
# in addition, it is gzipped using GZip.gzip.
#
# The returned String object includes Undumping
# so it has an #undump method. See Tokens.load.
#
# You can configure the level of compression,
# but the default value 7 should be what you want
# in most cases as it is a good compromise between
# speed and compression rate.
def dump gzip_level = 7
  require 'coderay/helpers/gzip_simple'  # deliberate lazy load
  dump = Marshal.dump self
  dump = dump.gzip gzip_level
  dump.extend Undumping  # extend returns the receiver, i.e. the dump string
end
# The total size of the tokens.
# Should be equal to the input size before
# scanning.
def text_size
  size = 0
  each_text_token do |t, k|
    size += t.size
  end
  size
end
# The concatenated text of all text tokens.
# Should be equal to the input before scanning.
def text
  # Special tokens map to nil, which Array#join skips.
  map { |t, k| t if t.is_a? ::String }.join
end
# Include this module to give an object an #undump
# method.
#
# The string returned by Tokens.dump includes Undumping.
module Undumping
  # Calls Tokens.load with itself.
  def undump
    Tokens.load self
  end
end
# Undump the object using Marshal.load, then
# unzip it using GZip.gunzip.
#
# The result is commonly a Tokens object, but
# this is not guaranteed.
def Tokens.load dump
  require 'coderay/helpers/gzip_simple'  # deliberate lazy load
  dump = dump.gunzip
  # NOTE(review): Marshal.load must only ever be fed trusted dumps;
  # deserializing untrusted data can execute arbitrary code.
  @dump = Marshal.load dump
end
311 # The TokenStream class is a fake Array without elements.
313 # It redirects the method << to a block given at creation.
315 # This allows scanners and Encoders to use streaming (no
316 # tokens are saved, the input is highlighted the same time it
317 # is scanned) with the same code.
319 # See CodeRay.encode_stream and CodeRay.scan_stream
320 class TokenStream < Tokens
322 # Whether the object is a TokenStream.
329 # The Array is empty, but size counts the tokens given by <<.
# Creates a new TokenStream that calls +block+ whenever
# its << method is called.
#
# Example:
#
#   token_stream = CodeRay::TokenStream.new do |kind, text|
#     puts 'kind: %s, text size: %d.' % [kind, text.size]
#   end
#
#   token_stream << [:regexp, '/\d+/']
#   #-> kind: regexp, text size: 5.
#
def initialize &block
  raise ArgumentError, 'Block expected for streaming.' unless block
  @callback = block
  @size = 0  # counts the tokens given by <<
end
352 # Calls +block+ with +token+ and increments size.
# This method is not implemented due to speed reasons. Use Tokens.
def text_size
  raise NotImplementedError,
    'This method is not implemented due to speed reasons.'
end
# A TokenStream cannot be dumped. Use Tokens.
#
# Accepts the same optional gzip_level argument as Tokens#dump so
# callers of either signature get NotImplementedError, not ArgumentError.
def dump gzip_level = 7
  raise NotImplementedError, 'A TokenStream cannot be dumped.'
end
# A TokenStream cannot be optimized. Use Tokens.
def optimize
  raise NotImplementedError, 'A TokenStream cannot be optimized.'
end
380 # Token name abbreviations
381 require 'coderay/token_classes'