001 package org.maltparser.core.symbol.trie; 002 003 import java.io.BufferedReader; 004 import java.io.BufferedWriter; 005 import java.io.FileInputStream; 006 import java.io.FileNotFoundException; 007 import java.io.FileOutputStream; 008 import java.io.UnsupportedEncodingException; 009 010 import java.io.IOException; 011 import java.io.InputStreamReader; 012 import java.io.OutputStreamWriter; 013 import java.util.HashMap; 014 import java.util.Set; 015 import java.util.regex.Pattern; 016 import java.util.regex.PatternSyntaxException; 017 018 import org.apache.log4j.Logger; 019 020 import org.maltparser.core.exception.MaltChainedException; 021 import org.maltparser.core.symbol.SymbolException; 022 import org.maltparser.core.symbol.SymbolTable; 023 import org.maltparser.core.symbol.SymbolTableHandler; 024 025 026 /** 027 028 @author Johan Hall 029 @since 1.0 030 */ 031 public class TrieSymbolTableHandler implements SymbolTableHandler { 032 private Trie trie; 033 private HashMap<String, TrieSymbolTable> symbolTables; 034 035 public TrieSymbolTableHandler() { 036 trie = new Trie(); 037 symbolTables = new HashMap<String, TrieSymbolTable>(); 038 } 039 040 public TrieSymbolTable addSymbolTable(String tableName) throws MaltChainedException { 041 TrieSymbolTable symbolTable = symbolTables.get(tableName); 042 if (symbolTable == null) { 043 symbolTable = new TrieSymbolTable(tableName, trie); 044 symbolTables.put(tableName, symbolTable); 045 } 046 return symbolTable; 047 } 048 049 public TrieSymbolTable addSymbolTable(String tableName, SymbolTable parentTable) throws MaltChainedException { 050 TrieSymbolTable symbolTable = symbolTables.get(tableName); 051 if (symbolTable == null) { 052 TrieSymbolTable trieParentTable = (TrieSymbolTable)parentTable; 053 symbolTable = new TrieSymbolTable(tableName, trie, trieParentTable.getColumnCategory(), trieParentTable.getNullValueStrategy()); 054 symbolTables.put(tableName, symbolTable); 055 } 056 return symbolTable; 057 } 058 059 public TrieSymbolTable addSymbolTable(String tableName, int columnCategory, String nullValueStrategy) throws MaltChainedException { 060 TrieSymbolTable symbolTable = symbolTables.get(tableName); 061 if (symbolTable == null) { 062 symbolTable = new TrieSymbolTable(tableName, trie, columnCategory, nullValueStrategy); 063 symbolTables.put(tableName, symbolTable); 064 } 065 return symbolTable; 066 } 067 068 public TrieSymbolTable addSymbolTable(String tableName, int columnCategory, String nullValueStrategy, String rootLabel) throws MaltChainedException { 069 TrieSymbolTable symbolTable = symbolTables.get(tableName); 070 if (symbolTable == null) { 071 symbolTable = new TrieSymbolTable(tableName, trie, columnCategory, nullValueStrategy, rootLabel); 072 symbolTables.put(tableName, symbolTable); 073 } 074 return symbolTable; 075 } 076 077 public TrieSymbolTable getSymbolTable(String tableName) { 078 return symbolTables.get(tableName); 079 } 080 081 public Set<String> getSymbolTableNames() { 082 return symbolTables.keySet(); 083 } 084 085 public void save(OutputStreamWriter osw) throws MaltChainedException { 086 try { 087 BufferedWriter bout = new BufferedWriter(osw); 088 for (TrieSymbolTable table : symbolTables.values()) { 089 table.saveHeader(bout); 090 } 091 bout.write('\n'); 092 for (TrieSymbolTable table : symbolTables.values()) { 093 table.save(bout); 094 } 095 bout.close(); 096 } catch (IOException e) { 097 throw new SymbolException("Could not save the symbol tables. ", e); 098 } 099 } 100 101 public void save(String fileName, String charSet) throws MaltChainedException { 102 try { 103 save(new OutputStreamWriter(new FileOutputStream(fileName), charSet)); 104 } catch (FileNotFoundException e) { 105 throw new SymbolException("The symbol table file '"+fileName+"' cannot be created. ", e); 106 } catch (UnsupportedEncodingException e) { 107 throw new SymbolException("The char set '"+charSet+"' is not supported. ", e); 108 } 109 } 110 111 public void loadHeader(BufferedReader bin) throws MaltChainedException { 112 String fileLine = ""; 113 Pattern tabPattern = Pattern.compile("\t"); 114 try { 115 while ((fileLine = bin.readLine()) != null) { 116 if (fileLine.length() == 0 || fileLine.charAt(0) != '\t') { 117 break; 118 } 119 String items[]; 120 try { 121 items = tabPattern.split(fileLine.substring(1)); 122 } catch (PatternSyntaxException e) { 123 throw new SymbolException("The header line of the symbol table '"+fileLine.substring(1)+"' could not split into atomic parts. ", e); 124 } 125 if (items.length != 4) { 126 throw new SymbolException("The header line of the symbol table '"+fileLine.substring(1)+"' must contain four columns. "); 127 } 128 if (items[3].equals("#DUMMY#")) { 129 addSymbolTable(items[0], Integer.parseInt(items[1]), items[2]); 130 } else { 131 addSymbolTable(items[0], Integer.parseInt(items[1]), items[2], items[3]); 132 } 133 } 134 } catch (NumberFormatException e) { 135 throw new SymbolException("The symbol table file (.sym) contains a non-integer value in the header. ", e); 136 } catch (IOException e) { 137 throw new SymbolException("Could not load the symbol table. ", e); 138 } 139 } 140 141 142 public void load(InputStreamReader isr) throws MaltChainedException { 143 try { 144 BufferedReader bin = new BufferedReader(isr); 145 String fileLine; 146 SymbolTable table = null; 147 bin.mark(2); 148 if (bin.read() == '\t') { 149 bin.reset(); 150 loadHeader(bin); 151 } else { 152 bin.reset(); 153 } 154 while ((fileLine = bin.readLine()) != null) { 155 if (fileLine.length() > 0) { 156 table = addSymbolTable(fileLine); 157 table.load(bin); 158 } 159 } 160 bin.close(); 161 } catch (IOException e) { 162 throw new SymbolException("Could not load the symbol tables. ", e); 163 } 164 } 165 166 public void load(String fileName, String charSet) throws MaltChainedException { 167 try { 168 load(new InputStreamReader(new FileInputStream(fileName), charSet)); 169 170 } catch (FileNotFoundException e) { 171 throw new SymbolException("The symbol table file '"+fileName+"' cannot be found. ", e); 172 } catch (UnsupportedEncodingException e) { 173 throw new SymbolException("The char set '"+charSet+"' is not supported. ", e); 174 } 175 } 176 177 178 public SymbolTable loadTagset(String fileName, String tableName, String charSet, int columnCategory, String nullValueStrategy) throws MaltChainedException { 179 try { 180 BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), charSet)); 181 String fileLine; 182 TrieSymbolTable table = addSymbolTable(tableName, columnCategory, nullValueStrategy); 183 184 while ((fileLine = br.readLine()) != null) { 185 table.addSymbol(fileLine.trim()); 186 } 187 return table; 188 } catch (FileNotFoundException e) { 189 throw new SymbolException("The tagset file '"+fileName+"' cannot be found. ", e); 190 } catch (UnsupportedEncodingException e) { 191 throw new SymbolException("The char set '"+charSet+"' is not supported. ", e); 192 } catch (IOException e) { 193 throw new SymbolException("The tagset file '"+fileName+"' cannot be loaded. ", e); 194 } 195 } 196 197 public SymbolTable loadTagset(String fileName, String tableName, String charSet, int columnCategory, String nullValueStrategy, String rootLabel) throws MaltChainedException { 198 try { 199 BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), charSet)); 200 String fileLine; 201 TrieSymbolTable table = addSymbolTable(tableName, columnCategory, nullValueStrategy, rootLabel); 202 203 while ((fileLine = br.readLine()) != null) { 204 table.addSymbol(fileLine.trim()); 205 } 206 return table; 207 } catch (FileNotFoundException e) { 208 throw new SymbolException("The tagset file '"+fileName+"' cannot be found. ", e); 209 } catch (UnsupportedEncodingException e) { 210 throw new SymbolException("The char set '"+charSet+"' is not supported. ", e); 211 } catch (IOException e) { 212 throw new SymbolException("The tagset file '"+fileName+"' cannot be loaded. ", e); 213 } 214 } 215 216 public void printSymbolTables(Logger logger) throws MaltChainedException { 217 for (TrieSymbolTable table : symbolTables.values()) { 218 table.printSymbolTable(logger); 219 } 220 } 221 }