src/com/swabunga/spell/engine/SpellDictionaryDisk.java

   1 /*\r
   2 Jazzy - a Java library for Spell Checking\r
   3 Copyright (C) 2001 Mindaugas Idzelis\r
   4 Full text of license can be found in LICENSE.txt\r
   5 \r
   6 This library is free software; you can redistribute it and/or\r
   7 modify it under the terms of the GNU Lesser General Public\r
   8 License as published by the Free Software Foundation; either\r
   9 version 2.1 of the License, or (at your option) any later version.\r
  10 \r
  11 This library is distributed in the hope that it will be useful,\r
  12 but WITHOUT ANY WARRANTY; without even the implied warranty of\r
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\r
  14 Lesser General Public License for more details.\r
  15 \r
  16 You should have received a copy of the GNU Lesser General Public\r
  17 License along with this library; if not, write to the Free Software\r
  18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA\r
  19 */\r
  20 /* Created by bgalbs on Jan 30, 2003 at 11:38:39 PM */\r
  21 package com.swabunga.spell.engine;\r
  22 \r
  23 import java.io.BufferedOutputStream;\r
  24 import java.io.BufferedReader;\r
  25 import java.io.BufferedWriter;\r
  26 import java.io.File;\r
  27 import java.io.FileInputStream;\r
  28 import java.io.FileNotFoundException;\r
  29 import java.io.FileOutputStream;\r
  30 import java.io.FileReader;\r
  31 import java.io.FileWriter;\r
  32 import java.io.IOException;\r
  33 import java.io.InputStream;\r
  34 import java.util.ArrayList;\r
  35 import java.util.Collections;\r
  36 import java.util.HashMap;\r
  37 import java.util.List;\r
  38 import java.util.Map;\r
  39 import java.util.StringTokenizer;\r
  40 import java.util.Vector;\r
  41 \r
  42 /**\r
  43  * An implementation of <code>SpellDictionary</code> that doesn't cache any words in memory. Avoids the huge\r
  44  * footprint of <code>SpellDictionaryHashMap</code> at the cost of relatively minor latency. A future version\r
  45  * of this class that implements some caching strategies might be a good idea in the future, if there's any\r
  46  * demand for it.\r
  47  * <p>\r
  48  * This class makes use of the "classic" Java IO library (java.io). However, it could probably benefit from\r
  49  * the new IO APIs (java.nio) and it is anticipated that a future version of this class, probably called\r
  50  * <code>SpellDictionaryDiskNIO</code> will appear at some point.\r
  51  *\r
  52  * @author Ben Galbraith (ben@galbraiths.org)\r
  53  * @version 0.1\r
  54  * @since 0.5\r
  55  */\r
  56 public class SpellDictionaryDisk extends SpellDictionaryASpell {\r
  57   private final static String DIRECTORY_WORDS = "words";\r
  58   private final static String DIRECTORY_DB = "db";\r
  59   private final static String FILE_CONTENTS = "contents";\r
  60   private final static String FILE_DB = "words.db";\r
  61   private final static String FILE_INDEX = "words.idx";\r
  62 \r
  63   /* maximum number of words an index entry can represent */\r
  64   private final static int INDEX_SIZE_MAX = 200;\r
  65 \r
  66   private final File base;\r
  67   private final File words;\r
  68   private final File db;\r
  69   @SuppressWarnings("unchecked")\r
  70 private Map index;\r
  71   /**\r
  72    * The flag indicating if the initial preparation or loading of the on \r
  73    * disk dictionary is complete.\r
  74    */\r
  75   protected boolean ready;\r
  76 \r
  77   /* used at time of creation of index to speed up determining the number of words per index entry */\r
  78   @SuppressWarnings("unchecked")\r
  79 private List indexCodeCache = null;\r
  80 \r
  81   /**\r
  82    * Construct a spell dictionary on disk. \r
  83    * The spell dictionary is created from words list(s) contained in file(s).\r
  84    * A words list file is a file with one word per line. Words list files are\r
  85    * located in a <code>base/words</code> dictionary where <code>base</code> \r
  86    * is the path to <code>words</code> dictionary. The on disk spell \r
  87    * dictionary is created in <code>base/db</code> dictionary and contains \r
  88    * files:\r
  89    * <ul>\r
  90    * <li><code>contents</code> list the words files used for spelling.</li>\r
  91    * <li><code>words.db</code> the content of words files organized as\r
  92    * a <em>database</em> of words.</li>\r
  93    * <li><code>words.idx</code> an index file to the <code>words.db</code>\r
  94    * file content.</li>\r
  95    * </ul>\r
  96    * The <code>contents</code> file has a list of \r
  97    * <code>filename, size</code> indicating the name and length of each files\r
  98    * in the <code>base/words</code> dictionary. If one of theses files was \r
  99    * changed, added or deleted before the call to the constructor, the process \r
 100    * of producing new or updated <code>words.db</code> and \r
 101    * <code>words.idx</code> files is started again.\r
 102    * <p/>\r
 103    * The spellchecking process is then worked upon the <code>words.db</code>\r
 104    * and <code>words.idx</code> files.\r
 105    * <p/>\r
 106    * \r
 107    * NOTE: Do *not* create two instances of this class pointing to the same <code>base</code> unless\r
 108    * you are sure that a new dictionary does not have to be created. In the future, some sort of\r
 109    * external locking mechanism may be created that handles this scenario gracefully.\r
 110    * \r
 111    * @param base the base directory in which <code>SpellDictionaryDisk</code> can expect to find\r
 112    * its necessary files.\r
 113    * @param phonetic the phonetic file used by the spellchecker.\r
 114    * @param block if a new word db needs to be created, there can be a considerable delay before\r
 115    * the constructor returns. If block is true, this method will block while the db is created\r
 116    * and return when done. If block is false, this method will create a thread to create the new\r
 117    * dictionary and return immediately.\r
 118    * @throws java.io.FileNotFoundException indicates problems locating the\r
 119    * files on the system\r
 120    * @throws java.io.IOException indicates problems reading the files\r
 121    */\r
 122   public SpellDictionaryDisk(File base, File phonetic, boolean block) throws FileNotFoundException, IOException {\r
 123     super(phonetic);\r
 124     this.ready = false;\r
 125 \r
 126     this.base = base;\r
 127     this.words = new File(base, DIRECTORY_WORDS);\r
 128     this.db = new File(base, DIRECTORY_DB);\r
 129 \r
 130     if (!this.base.exists()) throw new FileNotFoundException("Couldn't find required path '" + this.base + "'");\r
 131     if (!this.words.exists()) throw new FileNotFoundException("Couldn't find required path '" + this.words + "'");\r
 132     if (!this.db.exists()) db.mkdirs();\r
 133 \r
 134     if (newDictionaryFiles()) {\r
 135       if (block) {\r
 136         buildNewDictionaryDatabase();\r
 137         loadIndex();\r
 138         ready = true;\r
 139       } else {\r
 140         Thread t = new Thread() {\r
 141           @Override\r
 142                 public void run() {\r
 143             try {\r
 144               buildNewDictionaryDatabase();\r
 145               loadIndex();\r
 146               ready = true;\r
 147             } catch (Exception e) {\r
 148               e.printStackTrace();\r
 149             }\r
 150           }\r
 151         };\r
 152         t.start();\r
 153       }\r
 154     } else {\r
 155       loadIndex();\r
 156       ready = true;\r
 157     }\r
 158   }\r
 159 \r
 160   /**\r
 161    * Builds the file words database file and the contents file for the on\r
 162    * disk dictionary.\r
 163    */\r
 164   protected void buildNewDictionaryDatabase() throws FileNotFoundException, IOException {\r
 165     /* combine all dictionary files into one sorted file */\r
 166     File sortedFile = buildSortedFile();\r
 167 \r
 168     /* create the db for the sorted file */\r
 169     buildCodeDb(sortedFile);\r
 170     sortedFile.delete();\r
 171 \r
 172     /* build contents file */\r
 173     buildContentsFile();\r
 174   }\r
 175 \r
 176   /**\r
 177    * Adds another word to the dictionary. <em>This method is  not yet implemented\r
 178    * for this class</em>.\r
 179    * @param word The word to add.\r
 180    */\r
 181   public void addWord(String word) {\r
 182     throw new UnsupportedOperationException("addWord not yet implemented (sorry)");\r
 183   }\r
 184 \r
 185   /**\r
 186    * Returns a list of words that have the same phonetic code.\r
 187    * @param code The phonetic code common to the list of words\r
 188    * @return A list of words having the same phonetic code\r
 189    */\r
 190   @Override\r
 191 @SuppressWarnings("unchecked")\r
 192 public List getWords(String code) {\r
 193     Vector words = new Vector();\r
 194 \r
 195     int[] posLen = getStartPosAndLen(code);\r
 196     if (posLen != null) {\r
 197       try {\r
 198         InputStream input = new FileInputStream(new File(db, FILE_DB));\r
 199         input.skip(posLen[0]);\r
 200         byte[] bytes = new byte[posLen[1]];\r
 201         input.read(bytes, 0, posLen[1]);\r
 202         input.close();\r
 203 \r
 204         String data = new String(bytes);\r
 205         String[] lines = split(data, "\n");\r
 206         for (String line : lines) {\r
 207           String[] s = split(line, ",");\r
 208           if (s[0].equals(code)) words.addElement(s[1]);\r
 209         }\r
 210       } catch (Exception e) {\r
 211         e.printStackTrace();\r
 212       }\r
 213     }\r
 214 \r
 215     return words;\r
 216   }\r
 217 \r
 218   /**\r
 219    * Indicates if the initial preparation or loading of the on disk dictionary\r
 220    * is complete.\r
 221    * @return the indication that the dictionary initial setup is done.\r
 222    */\r
 223   public boolean isReady() {\r
 224     return ready;\r
 225   }\r
 226 \r
 227   @SuppressWarnings("unchecked")\r
 228 private boolean newDictionaryFiles() throws FileNotFoundException, IOException {\r
 229     /* load in contents file, which indicates the files and sizes of the last db build */\r
 230     List contents = new ArrayList();\r
 231     File c = new File(db, FILE_CONTENTS);\r
 232     if (c.exists()) {\r
 233       BufferedReader reader = null;\r
 234       try {\r
 235         reader = new BufferedReader(new FileReader(c));\r
 236         String line;\r
 237         while ((line = reader.readLine()) != null) {\r
 238           // format of file should be [filename],[size]\r
 239           String[] s = split(line, ",");\r
 240           contents.add(new FileSize(s[0], Integer.parseInt(s[1])));\r
 241         }\r
 242       } catch (FileNotFoundException e) {\r
 243         throw e;\r
 244       } catch (IOException e) {\r
 245         throw e;\r
 246       } finally {\r
 247         if (reader != null) reader.close();\r
 248       }\r
 249     }\r
 250 \r
 251     /* compare this to the actual directory */\r
 252     boolean changed = false;\r
 253     File[] wordFiles = words.listFiles();\r
 254     if (contents.size() != wordFiles.length) {\r
 255       // if the size of the contents list and the number of word files are different, it\r
 256       // means we've definitely got to reindex\r
 257       changed = true;\r
 258     } else {\r
 259       // check and make sure that all the word files haven't changed on us\r
 260       for (File wordFile : wordFiles) {\r
 261         FileSize fs = new FileSize(wordFile.getName(), wordFile.length());\r
 262         if (!contents.contains(fs)) {\r
 263           changed = true;\r
 264           break;\r
 265         }\r
 266       }\r
 267     }\r
 268 \r
 269     return changed;\r
 270   }\r
 271 \r
 272   @SuppressWarnings("unchecked")\r
 273 private File buildSortedFile() throws FileNotFoundException, IOException {\r
 274     List w = new ArrayList();\r
 275 \r
 276     /*\r
 277      * read every single word into the list. eeek. if this causes problems,\r
 278      * we may wish to explore disk-based sorting or more efficient memory-based storage\r
 279      */\r
 280     File[] wordFiles = words.listFiles();\r
 281     for (File wordFile : wordFiles) {\r
 282       BufferedReader r = new BufferedReader(new FileReader(wordFile));\r
 283       String word;\r
 284       while ((word = r.readLine()) != null) {\r
 285         if (!word.equals("")) {\r
 286           w.add(word.trim());\r
 287         }\r
 288       }\r
 289       r.close();\r
 290     }\r
 291 \r
 292     Collections.sort(w);\r
 293 \r
 294     // FIXME - error handling for running out of disk space would be nice.\r
 295     File file = File.createTempFile("jazzy", "sorted");\r
 296     BufferedWriter writer = new BufferedWriter(new FileWriter(file));\r
 297     String prev = null;\r
 298     for (int i = 0; i < w.size(); i++) {\r
 299       String word = (String) w.get(i);\r
 300       if (prev == null || !prev.equals(word)) {\r
 301         writer.write(word);\r
 302         writer.newLine();\r
 303       }\r
 304       prev = word;\r
 305     }\r
 306     writer.close();\r
 307 \r
 308     return file;\r
 309   }\r
 310 \r
 311   @SuppressWarnings("unchecked")\r
 312 private void buildCodeDb(File sortedWords) throws FileNotFoundException, IOException {\r
 313     List codeList = new ArrayList();\r
 314 \r
 315     BufferedReader reader = new BufferedReader(new FileReader(sortedWords));\r
 316     String word;\r
 317     while ((word = reader.readLine()) != null) {\r
 318       codeList.add(new CodeWord(this.getCode(word), word));\r
 319     }\r
 320     reader.close();\r
 321 \r
 322     Collections.sort(codeList);\r
 323 \r
 324     List index = new ArrayList();\r
 325 \r
 326     BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(new File(db, FILE_DB)));\r
 327     String currentCode = null;\r
 328     int currentPosition = 0;\r
 329     int currentLength = 0;\r
 330     for (int i = 0; i < codeList.size(); i++) {\r
 331       CodeWord cw = (CodeWord) codeList.get(i);\r
 332       String thisCode = cw.getCode();\r
 333 //            if (thisCode.length() > 3) thisCode = thisCode.substring(0, 3);\r
 334       thisCode = getIndexCode(thisCode, codeList);\r
 335       String toWrite = cw.getCode() + "," + cw.getWord() + "\n";\r
 336       byte[] bytes = toWrite.getBytes();\r
 337 \r
 338       if (currentCode == null) currentCode = thisCode;\r
 339       if (!currentCode.equals(thisCode)) {\r
 340         index.add(new Object[]{currentCode, new int[]{currentPosition, currentLength}});\r
 341         currentPosition += currentLength;\r
 342         currentLength = bytes.length;\r
 343         currentCode = thisCode;\r
 344       } else {\r
 345         currentLength += bytes.length;\r
 346       }\r
 347       out.write(bytes);\r
 348     }\r
 349     out.close();\r
 350 \r
 351     // Output the last iteration\r
 352     if (currentCode != null && currentPosition != 0 && currentLength != 0)\r
 353       index.add(new Object[]{currentCode, new int[]{currentPosition, currentLength}});\r
 354 \r
 355     BufferedWriter writer = new BufferedWriter(new FileWriter(new File(db, FILE_INDEX)));\r
 356     for (int i = 0; i < index.size(); i++) {\r
 357       Object[] o = (Object[]) index.get(i);\r
 358       writer.write(o[0].toString());\r
 359       writer.write(",");\r
 360       writer.write(String.valueOf(((int[]) o[1])[0]));\r
 361       writer.write(",");\r
 362       writer.write(String.valueOf(((int[]) o[1])[1]));\r
 363       writer.newLine();\r
 364     }\r
 365     writer.close();\r
 366   }\r
 367 \r
 368   private void buildContentsFile() throws IOException {\r
 369     File[] wordFiles = words.listFiles();\r
 370     if (wordFiles.length > 0) {\r
 371       BufferedWriter writer = new BufferedWriter(new FileWriter(new File(db, FILE_CONTENTS)));\r
 372       for (File wordFile : wordFiles) {\r
 373         writer.write(wordFile.getName());\r
 374         writer.write(",");\r
 375         writer.write(String.valueOf(wordFile.length()));\r
 376         writer.newLine();\r
 377       }\r
 378       writer.close();\r
 379     } else {\r
 380       new File(db, FILE_CONTENTS).delete();\r
 381     }\r
 382   }\r
 383 \r
 384   /**\r
 385    * Loads the index file from disk. The index file accelerates words lookup\r
 386    * into the dictionary db file.\r
 387    */\r
 388   @SuppressWarnings("unchecked")\r
 389 protected void loadIndex() throws IOException {\r
 390     index = new HashMap();\r
 391     File idx = new File(db, FILE_INDEX);\r
 392     BufferedReader reader = new BufferedReader(new FileReader(idx));\r
 393     String line;\r
 394     while ((line = reader.readLine()) != null) {\r
 395       String[] fields = split(line, ",");\r
 396       index.put(fields[0], new int[]{Integer.parseInt(fields[1]), Integer.parseInt(fields[2])});\r
 397     }\r
 398     reader.close();\r
 399   }\r
 400 \r
 401   private int[] getStartPosAndLen(String code) {\r
 402     while (code.length() > 0) {\r
 403       int[] posLen = (int[]) index.get(code);\r
 404       if (posLen == null) {\r
 405         code = code.substring(0, code.length() - 1);\r
 406       } else {\r
 407         return posLen;\r
 408       }\r
 409     }\r
 410     return null;\r
 411   }\r
 412 \r
 413   @SuppressWarnings("unchecked")\r
 414 private String getIndexCode(String code, List codes) {\r
 415     if (indexCodeCache == null) indexCodeCache = new ArrayList();\r
 416 \r
 417     if (code.length() <= 1) return code;\r
 418 \r
 419     for (int i = 0; i < indexCodeCache.size(); i++) {\r
 420       String c = (String) indexCodeCache.get(i);\r
 421       if (code.startsWith(c)) return c;\r
 422     }\r
 423 \r
 424     int foundSize = -1;\r
 425     boolean cacheable = false;\r
 426     for (int z = 1; z < code.length(); z++) {\r
 427       String thisCode = code.substring(0, z);\r
 428       int count = 0;\r
 429       for (int i = 0; i < codes.size();) {\r
 430         if (i == 0) {\r
 431           i = Collections.binarySearch(codes, new CodeWord(thisCode, ""));\r
 432           if (i < 0) i = 0;\r
 433         }\r
 434 \r
 435         CodeWord cw = (CodeWord) codes.get(i);\r
 436         if (cw.getCode().startsWith(thisCode)) {\r
 437           count++;\r
 438           if (count > INDEX_SIZE_MAX) break;\r
 439         } else if (cw.getCode().compareTo(thisCode) > 0) break;\r
 440         i++;\r
 441       }\r
 442       if (count <= INDEX_SIZE_MAX) {\r
 443         cacheable = true;\r
 444         foundSize = z;\r
 445         break;\r
 446       }\r
 447     }\r
 448 \r
 449     String newCode = (foundSize == -1) ? code : code.substring(0, foundSize);\r
 450     if (cacheable) indexCodeCache.add(newCode);\r
 451     return newCode;\r
 452   }\r
 453 \r
 454   private static String[] split(String input, String delimiter) {\r
 455     StringTokenizer st = new StringTokenizer(input, delimiter);\r
 456     int count = st.countTokens();\r
 457     String[] out = new String[count];\r
 458 \r
 459     for (int i = 0; i < count; i++) {\r
 460       out[i] = st.nextToken();\r
 461     }\r
 462 \r
 463     return out;\r
 464   }\r
 465 \r
 466   @SuppressWarnings("unchecked")\r
 467 private class CodeWord implements Comparable {\r
 468     private final String code;\r
 469     private final String word;\r
 470 \r
 471     public CodeWord(String code, String word) {\r
 472       this.code = code;\r
 473       this.word = word;\r
 474     }\r
 475 \r
 476     public String getCode() {\r
 477       return code;\r
 478     }\r
 479 \r
 480     public String getWord() {\r
 481       return word;\r
 482     }\r
 483 \r
 484     @Override\r
 485         public boolean equals(Object o) {\r
 486       if (this == o) return true;\r
 487       if (!(o instanceof CodeWord)) return false;\r
 488 \r
 489       final CodeWord codeWord = (CodeWord) o;\r
 490 \r
 491       if (!word.equals(codeWord.word)) return false;\r
 492 \r
 493       return true;\r
 494     }\r
 495 \r
 496     @Override\r
 497         public int hashCode() {\r
 498       return word.hashCode();\r
 499     }\r
 500 \r
 501     public int compareTo(Object o) {\r
 502       return code.compareTo(((CodeWord) o).getCode());\r
 503     }\r
 504   }\r
 505 \r
 506   private class FileSize {\r
 507     private final String filename;\r
 508     private final long size;\r
 509 \r
 510     public FileSize(String filename, long size) {\r
 511       this.filename = filename;\r
 512       this.size = size;\r
 513     }\r
 514 \r
 515     @SuppressWarnings("unused")\r
 516         public String getFilename() {\r
 517       return filename;\r
 518     }\r
 519 \r
 520     @SuppressWarnings("unused")\r
 521         public long getSize() {\r
 522       return size;\r
 523     }\r
 524 \r
 525     @Override\r
 526         public boolean equals(Object o) {\r
 527       if (this == o) return true;\r
 528       if (!(o instanceof FileSize)) return false;\r
 529 \r
 530       final FileSize fileSize = (FileSize) o;\r
 531 \r
 532       if (size != fileSize.size) return false;\r
 533       if (!filename.equals(fileSize.filename)) return false;\r
 534 \r
 535       return true;\r
 536     }\r
 537 \r
 538     @Override\r
 539         public int hashCode() {\r
 540       int result;\r
 541       result = filename.hashCode();\r
 542       result = (int) (29 * result + size);\r
 543       return result;\r
 544     }\r
 545   }\r
 546 }\r