1   package eu.fbk.dkm.premon.util;
2   
3   import com.google.common.base.Charsets;
4   import com.google.common.base.Objects;
5   import com.google.common.base.Preconditions;
6   import com.google.common.collect.*;
7   import com.google.common.io.Resources;
8   import eu.fbk.rdfpro.util.Environment;
9   import net.didion.jwnl.JWNL;
10  import net.didion.jwnl.JWNLException;
11  import net.didion.jwnl.data.*;
12  import net.didion.jwnl.dictionary.Dictionary;
13  
14  import javax.annotation.Nullable;
15  import java.io.ByteArrayInputStream;
16  import java.io.InputStream;
17  import java.util.*;
18  
19  public final class WordNet {
20  
21      public static final String POS_NOUN = "n";
22  
23      public static final String POS_VERB = "v";
24  
25      public static final String POS_ADJECTIVE = "a";
26  
27      public static final String POS_ADVERB = "r";
28  
29      private static final Map<String, String> BBN_TO_SYNSET;
30  
31      private static final Map<String, List<String>> SYNSET_TO_BBN; // built from bbnToSynset
32  
33      private static final Map<String, String> BBN_TO_SST;
34  
35      private static final Map<String, String> SYNSET_TO_SST; // contains partial mapping overriding
36      // lexicographer file
37  
38      private static String dictionaryPath = Objects.firstNonNull(
39              Environment.getProperty("wordnet.home"), "wordnet");
40  
41      private static Dictionary dictionary;
42  
43      static {
44          // TODO: need better mapping
45          final Map<String, String> bbnToSynset = Maps.newLinkedHashMap();
46          bbnToSynset.put("person", "00007846-n");
47          bbnToSynset.put("organization", "08008335-n"); // was 07950920
48          bbnToSynset.put("gpe", "00027167-n");
49          bbnToSynset.put("location", "00027167-n");
50          bbnToSynset.put("event", "00029378-n");
51          bbnToSynset.put("product", "04007894-n");
52          bbnToSynset.put("fac", "03315023-n");
53          bbnToSynset.put("work_of_art", "02743547-n");
54          bbnToSynset.put("law", "06532330-n");
55          bbnToSynset.put("language", "06282651-n");
56          bbnToSynset.put("quantity", "00033615-n");
57          bbnToSynset.put("date", "15113229-n");
58          bbnToSynset.put("time", "15113229-n");
59          bbnToSynset.put("percent", "13817526-n");
60          bbnToSynset.put("money", "13384557-n");
61          bbnToSynset.put("ordinal", "14429985-n");
62          bbnToSynset.put("cardinal", "13582013-n");
63          BBN_TO_SYNSET = ImmutableMap.copyOf(bbnToSynset);
64  
65          final Map<String, List<String>> synsetToBbn = Maps.newHashMap();
66          for (final Map.Entry<String, String> entry : bbnToSynset.entrySet()) {
67              final String bbn = entry.getKey();
68              final String synset = entry.getValue();
69              final List<String> list = synsetToBbn.get(synset);
70              if (list == null) {
71                  synsetToBbn.put(synset, ImmutableList.of(bbn));
72              } else {
73                  synsetToBbn.put(
74                          synset,
75                          Ordering.natural().immutableSortedCopy(
76                                  Iterables.concat(list, ImmutableList.of(bbn))));
77              }
78          }
79          SYNSET_TO_BBN = ImmutableMap.copyOf(synsetToBbn);
80  
81          final Map<String, String> bbnToSst = Maps.newLinkedHashMap();
82          bbnToSst.put("person", "B-noun.person");
83          bbnToSst.put("organization", "B-noun.group");
84          bbnToSst.put("gpe", "B-noun.location");
85          bbnToSst.put("location", "B-noun.location");
86          bbnToSst.put("event", "B-noun.event");
87          bbnToSst.put("product", "B-noun.artifact");
88          bbnToSst.put("fac", "B-noun.artifact");
89          bbnToSst.put("work_of_art", "B-noun.artifact");
90          bbnToSst.put("law", "B-noun.communication");
91          bbnToSst.put("language", "B-noun.communication");
92          bbnToSst.put("quantity", "B-noun.quantity");
93          bbnToSst.put("date", "B-noun.time");
94          bbnToSst.put("time", "B-noun.time");
95          bbnToSst.put("percent", "B-noun.relation");
96          bbnToSst.put("money", "B-noun.possession");
97          bbnToSst.put("ordinal", "B-noun.state");
98          bbnToSst.put("cardinal", "B-noun.quantity");
99          BBN_TO_SST = ImmutableMap.copyOf(bbnToSst);
100 
101         final Map<String, String> synsetToSst = Maps.newHashMap();
102         synsetToSst.put("00007846-n", "B-noun.person");
103         synsetToSst.put("00027167-n", "B-noun.location");
104         synsetToSst.put("00033615-n", "B-noun.quantity");
105         SYNSET_TO_SST = ImmutableMap.copyOf(synsetToSst);
106     }
107 
108     public static Dictionary getDictionary() {
109         synchronized (WordNet.class) {
110             if (dictionary == null) {
111                 JWNL.shutdown(); // in case it was previously initialized
112                 try {
113                     final String properties = Resources.toString(
114                             WordNet.class.getClassLoader().getResource("jwnl.xml"), Charsets.UTF_8).replace(
115                             "DICTIONARY_PATH_PLACEHOLDER", dictionaryPath);
116                     final InputStream stream = new ByteArrayInputStream(
117                             properties.getBytes(Charsets.UTF_8));
118                     JWNL.initialize(stream);
119                     dictionary = Dictionary.getInstance();
120                 } catch (final Throwable ex) {
121                     JWNL.shutdown();
122                     throw new Error("Cannot initialize JWNL using dictionary path '"
123                             + dictionaryPath + "'", ex);
124                 }
125             }
126             return dictionary;
127         }
128     }
129 
130     private static void releaseDictionary() {
131         synchronized (WordNet.class) {
132             dictionary = null;
133             JWNL.shutdown(); // safe to call it multiple times
134         }
135     }
136 
137     private static Synset getSynset(final String id) {
138         final POS pos = POS.getPOSForKey(getPOS(id));
139         final long offset = getOffset(id);
140         try {
141             synchronized (WordNet.class) {
142                 return getDictionary().getSynsetAt(pos, offset);
143             }
144         } catch (final JWNLException ex) {
145             throw new Error(ex);
146         }
147     }
148 
149     // synsetID has the form offset-x, where x is n for nouns, a for adjectives, v for verbs, r
150     // for adverbs
151 
152     public static void init() {
153         getDictionary();
154     }
155 
156     public static List<String> getSynsetsForLemma(String lemma, String pos) {
157         try {
158             synchronized (WordNet.class) {
159                 IndexWord indexWord = getDictionary().lookupIndexWord(POS.getPOSForKey(pos), lemma);
160                 if (indexWord == null) {
161                     return new ArrayList<>();
162                 }
163                 Synset[] synsets = indexWord.getSenses();
164                 ArrayList<String> ret = new ArrayList<>();
165                 for (int i = 0; i < synsets.length; i++) {
166                     Synset synset = synsets[i];
167                     ret.add(getSynsetID(synset.getOffset(), synset.getPOS().getKey()));
168                 }
169 
170                 return ret;
171             }
172         } catch (final JWNLException ex) {
173             throw new Error(ex);
174         }
175     }
176 
177     public static String getPath() {
178         synchronized (WordNet.class) {
179             return dictionaryPath;
180         }
181     }
182 
183     public static void setPath(final String dictionaryPath) {
184         Preconditions.checkNotNull(dictionaryPath);
185         synchronized (WordNet.class) {
186             if (!WordNet.dictionaryPath.equals(dictionaryPath)) {
187                 releaseDictionary();
188                 WordNet.dictionaryPath = dictionaryPath;
189             }
190         }
191     }
192 
193     // MANIPULATION OF SYNSET IDS
194 
195     public static String getSynsetID(final long offset, final String pos) {
196         return String.format("%08d-%s", offset, pos);
197     }
198 
199     /**
200      * Return the synset ID starting from a readable format:
201      * <ul>
202      * <li>lemma</li>
203      * <li>"-" (dash)</li>
204      * <li>synset number</li>
205      * <li>POS</li>
206      * </ul>
207      * <p>
208      * For example: look-3v
209      *
210      * @param readableSynsetID an absolute URL giving the base location of the image
211      */
212     @Nullable
213     public static String getSynsetID(@Nullable final String readableSynsetID) {
214         if (readableSynsetID == null) {
215             return null;
216         }
217         try {
218             final int length = readableSynsetID.length();
219             final int offset = readableSynsetID.lastIndexOf('-');
220             final String lemma = readableSynsetID.substring(0, offset);
221             final int index = Integer.parseInt(readableSynsetID.substring(offset + 1, length - 1)) - 1;
222             final POS pos = POS.getPOSForKey(readableSynsetID.substring(length - 1, length));
223             final IndexWord word;
224             synchronized (WordNet.class) {
225                 word = getDictionary().getIndexWord(pos, lemma);
226             }
227             final Synset synset = word.getSenses()[index];
228             return getSynsetID(synset.getOffset(), pos.getKey());
229         } catch (final JWNLException ex) {
230             throw new Error(ex);
231         } catch (final Throwable ex) {
232             throw new IllegalArgumentException("Illegal (readable) synset ID " + readableSynsetID,
233                     ex);
234         }
235     }
236 
237     @Nullable
238     public static String getReadableSynsetID(@Nullable final String synsetID) {
239         if (synsetID == null) {
240             return null;
241         }
242         final Synset synset = getSynset(synsetID);
243         if (synset == null) {
244             throw new IllegalArgumentException("Illegal synset ID " + synsetID);
245         }
246         final String lemma = synset.getWords()[0].getLemma();
247         final POS pos = POS.getPOSForKey(getPOS(synsetID));
248         try {
249             final IndexWord word;
250             synchronized (WordNet.class) {
251                 word = getDictionary().lookupIndexWord(pos, lemma);
252             }
253             final Synset[] senses = word.getSenses();
254             for (int i = 0; i < senses.length; ++i) {
255                 if (senses[i].equals(synset)) {
256                     return lemma + "-" + (i + 1) + pos.getKey();
257                 }
258             }
259             throw new Error("Could not determine sense index for lemma " + lemma + " and synset "
260                     + synsetID);
261         } catch (final JWNLException ex) {
262             throw new Error(ex);
263         }
264         // return synset.getSenseKey(lemma); // TODO
265     }
266 
267     public static String getPOS(final String synsetID) {
268         Preconditions.checkNotNull(synsetID);
269         final int index = synsetID.lastIndexOf('-');
270         if (index == synsetID.length() - 1 || synsetID.isEmpty()) {
271             throw new IllegalArgumentException("Cannot extract POS from '" + synsetID
272                     + "' - invalid string");
273         }
274         return ""
275                 + Character.toLowerCase(index < 0 ? synsetID.charAt(0) : synsetID
276                 .charAt(index + 1));
277     }
278 
279     public static long getOffset(String synsetID) {
280         Preconditions.checkNotNull(synsetID);
281         try {
282             final int index = synsetID.lastIndexOf('-');
283             if (index > 0) {
284                 synsetID = synsetID.substring(0, index);
285             }
286             return Long.parseLong(synsetID);
287         } catch (final Throwable ex) {
288             throw new IllegalArgumentException("Cannot extract offset from '" + synsetID + "'", ex);
289         }
290     }
291 
292     public static Set<String> getLemmas(final String synsetID) {
293         final Set<String> lemmas = Sets.newLinkedHashSet();
294         final Synset synset = getSynset(synsetID);
295         if (synset != null) {
296             for (final Word word : synset.getWords()) {
297                 lemmas.add(word.getLemma());
298             }
299         }
300         return lemmas;
301     }
302 
303     public static Set<String> getGenericSet(final String synsetID,
304             final PointerType... pointerTypes) {
305         final Set<String> ret = Sets.newHashSet();
306         final Synset synset = getSynset(synsetID);
307         if (synset != null) {
308             for (final PointerType pointerType : pointerTypes) {
309                 for (final Pointer pointer : synset.getPointers(pointerType)) {
310                     try {
311                         final Synset target = pointer.getTargetSynset();
312                         ret.add(getSynsetID(target.getOffset(), target.getPOS().getKey()));
313                     } catch (final Throwable ex) {
314                         throw new RuntimeException(ex);
315                     }
316                 }
317             }
318         }
319         return ret;
320     }
321 
322     public static Set<String> getGenericSet(final String synsetID, final boolean recursive,
323             final PointerType... pointerTypes) {
324         if (!recursive) {
325             return getGenericSet(synsetID, pointerTypes);
326         }
327         final Set<String> result = Sets.newHashSet();
328         final List<String> queue = Lists.newArrayList(synsetID);
329         while (!queue.isEmpty()) {
330             final String id = queue.remove(0);
331             if (result.add(id)) {
332                 queue.addAll(getGenericSet(id, pointerTypes));
333             }
334         }
335         return result;
336     }
337 
338     public static Set<String> getHypernyms(final String synsetID) {
339         return getGenericSet(synsetID, PointerType.HYPERNYM);
340     }
341 
342     public static Set<String> getHyponyms(final String synsetID) {
343         return getGenericSet(synsetID, PointerType.HYPONYM);
344     }
345 
346     public static Set<String> getHypernyms(final String synsetID, final boolean recursive) {
347         return getGenericSet(synsetID, recursive, PointerType.HYPERNYM,
348                 PointerType.INSTANCE_HYPERNYM);
349     }
350 
351     public static Set<String> getHyponims(final String synsetID, final boolean recursive) {
352         return getGenericSet(synsetID, recursive, PointerType.HYPONYM,
353                 PointerType.INSTANCES_HYPONYM);
354     }
355 
356     // returns only noun synsets
357     @Nullable
358     public static String mapBBNToSynset(@Nullable final String bbn) {
359         return bbn == null ? null : BBN_TO_SYNSET.get(bbn.trim().toLowerCase());
360     }
361 
362     // works only for noun synset
363     @Nullable
364     public static String mapSynsetToBBN(@Nullable final String synsetID) {
365         final List<String> ids = Lists.newLinkedList();
366         ids.add(synsetID);
367         while (!ids.isEmpty()) {
368             final String id = ids.remove(0);
369             final List<String> bbns = SYNSET_TO_BBN.get(id);
370             if (bbns != null && !bbns.isEmpty()) {
371                 return bbns.get(0); // return only first BBN in case of ambiguity
372             }
373             try {
374                 final Synset source = getSynset(id);
375                 final List<String> hypernymIDs = Lists.newArrayList();
376                 for (final PointerType type : new PointerType[] { PointerType.HYPERNYM,
377                         PointerType.INSTANCE_HYPERNYM }) {
378                     for (final Pointer pointer : source.getPointers(type)) {
379                         final Synset target = pointer.getTargetSynset();
380                         hypernymIDs.add(getSynsetID(target.getOffset(), target.getPOS().getKey()));
381                     }
382                 }
383                 Collections.sort(hypernymIDs); // necessary in order to get deterministic results
384                 ids.addAll(hypernymIDs);
385             } catch (final JWNLException ex) {
386                 throw new Error("Unexpected exception (!)", ex);
387             }
388         }
389         return null;
390     }
391 
392     @Nullable
393     public static String mapSynsetToSST(@Nullable final String synsetID) {
394         if (synsetID != null) {
395             final String sst = SYNSET_TO_SST.get(synsetID);
396             if (sst != null) {
397                 return sst;
398             }
399             return "B-" + getSynset(synsetID).getLexFileName();
400         }
401         return null;
402     }
403 
404     // returns always B-noun.XXX sst
405     @Nullable
406     public static String mapBBNToSST(@Nullable final String bbn) {
407         if (bbn != null) {
408             return BBN_TO_SST.get(bbn.trim().toLowerCase());
409         }
410         return null;
411     }
412 }