--- /dev/null
+# encoding: us-ascii
+######################## BEGIN LICENSE BLOCK ########################
+# The Original Code is Mozilla Universal charset detector code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 2001
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Jeff Hodges - port to Ruby
+# Mark Pilgrim - port to Python
+# Shy Shalom - original C code
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+# 02110-1301 USA
+######################### END LICENSE BLOCK #########################
+
+module CharDet
+ MINIMUM_THRESHOLD = 0.20
+ EPureAscii = 0
+ EEscAscii = 1
+ EHighbyte = 2
+
+ class UniversalDetector
+ attr_accessor :result
+ def initialize
+ @_highBitDetector = /[\x80-\xFF]/
+ @_escDetector = /(\033|\~\{)/
+ @_mEscCharSetProber = nil
+ @_mCharSetProbers = []
+ reset()
+ end
+
+ def reset
+ @result = {'encoding' => nil, 'confidence' => 0.0}
+ @done = false
+ @_mStart = true
+ @_mGotData = false
+ @_mInputState = EPureAscii
+ @_mLastChar = ''
+ if @_mEscCharSetProber
+ @_mEscCharSetProber.reset()
+ end
+ for prober in @_mCharSetProbers
+ prober.reset()
+ end
+ end
+
+ def feed(aBuf)
+ return if @done
+
+ aLen = aBuf.length
+ return if not aLen
+
+ if not @_mGotData
+ # If the data starts with BOM, we know it is UTF
+ if aBuf[0...3] == "\xEF\xBB\xBF"
+ # EF BB BF UTF-8 with BOM
+ @result = {'encoding' => "UTF-8", 'confidence' => 1.0}
+ elsif aBuf[0...4] == "\xFF\xFE\x00\x00"
+ # FF FE 00 00 UTF-32, little-endian BOM
+ @result = {'encoding' => "UTF-32LE", 'confidence' => 1.0}
+ elsif aBuf[0...4] == "\x00\x00\xFE\xFF"
+ # 00 00 FE FF UTF-32, big-endian BOM
+ @result = {'encoding' => "UTF-32BE", 'confidence' => 1.0}
+ elsif aBuf[0...4] == "\xFE\xFF\x00\x00"
+ # FE FF 00 00 UCS-4, unusual octet order BOM (3412)
+ @result = {'encoding' => "X-ISO-10646-UCS-4-3412", 'confidence' => 1.0}
+ elsif aBuf[0...4] == "\x00\x00\xFF\xFE"
+ # 00 00 FF FE UCS-4, unusual octet order BOM (2143)
+ @result = {'encoding' => "X-ISO-10646-UCS-4-2143", 'confidence' => 1.0}
+ elsif aBuf[0...2] == "\xFF\xFE"
+ # FF FE UTF-16, little endian BOM
+ @result = {'encoding' => "UTF-16LE", 'confidence' => 1.0}
+ elsif aBuf[0...2] == "\xFE\xFF"
+ # FE FF UTF-16, big endian BOM
+ @result = {'encoding' => "UTF-16BE", 'confidence' => 1.0}
+ end
+ end
+
+ @_mGotData = true
+ if @result['encoding'] and (@result['confidence'] > 0.0)
+ @done = true
+ return
+ end
+
+ if @_mInputState == EPureAscii
+ if @_highBitDetector =~ (aBuf)
+ @_mInputState = EHighbyte
+ elsif (@_mInputState == EPureAscii) and @_escDetector =~ (@_mLastChar + aBuf)
+ @_mInputState = EEscAscii
+ end
+ end
+
+ @_mLastChar = aBuf[-1..-1]
+ if @_mInputState == EEscAscii
+ if not @_mEscCharSetProber
+ @_mEscCharSetProber = EscCharSetProber.new()
+ end
+ if @_mEscCharSetProber.feed(aBuf) == EFoundIt
+ @result = {'encoding' => self._mEscCharSetProber.get_charset_name(),
+ 'confidence' => @_mEscCharSetProber.get_confidence()
+ }
+ @done = true
+ end
+ elsif @_mInputState == EHighbyte
+ if not @_mCharSetProbers or @_mCharSetProbers.empty?
+ @_mCharSetProbers = [MBCSGroupProber.new(), SBCSGroupProber.new(), Latin1Prober.new()]
+ end
+ for prober in @_mCharSetProbers
+ if prober.feed(aBuf) == EFoundIt
+ @result = {'encoding' => prober.get_charset_name(),
+ 'confidence' => prober.get_confidence()}
+ @done = true
+ break
+ end
+ end
+ end
+
+ end
+
+ def close
+ return if @done
+ if not @_mGotData
+ $stderr << "no data received!\n" if $debug
+ return
+ end
+ @done = true
+
+ if @_mInputState == EPureAscii
+ @result = {'encoding' => 'ascii', 'confidence' => 1.0}
+ return @result
+ end
+
+ if @_mInputState == EHighbyte
+ confidences = {}
+ @_mCharSetProbers.each{ |prober| confidences[prober] = prober.get_confidence }
+ maxProber = @_mCharSetProbers.max{ |a,b| confidences[a] <=> confidences[b] }
+ if maxProber and maxProber.get_confidence > MINIMUM_THRESHOLD
+ @result = {'encoding' => maxProber.get_charset_name(),
+ 'confidence' => maxProber.get_confidence()}
+ return @result
+ end
+ end
+
+ if $debug
+ $stderr << "no probers hit minimum threshhold\n" if $debug
+ for prober in @_mCharSetProbers[0]._mProbers
+ next if not prober
+ $stderr << "#{prober.get_charset_name} confidence = #{prober.get_confidence}\n" if $debug
+ end
+ end
+ end
+ end
+end