1   package eu.fbk.dkm.premon.premonitor;
2   
3   import com.google.common.base.Joiner;
4   import com.google.common.base.Preconditions;
5   import com.google.common.collect.*;
6   import com.google.common.io.Files;
7   import eu.fbk.dkm.premon.vocab.FB;
8   import eu.fbk.dkm.premon.vocab.ONTOLEX;
9   import eu.fbk.dkm.premon.vocab.PM;
10  import eu.fbk.rdfpro.*;
11  import eu.fbk.rdfpro.util.Statements;
12  import org.openrdf.model.*;
13  import org.openrdf.model.vocabulary.*;
14  import org.openrdf.rio.RDFHandler;
15  import org.openrdf.rio.RDFHandlerException;
16  import org.openrdf.rio.Rio;
17  
18  import java.io.File;
19  import java.io.IOException;
20  import java.util.*;
21  
22  public class FramebaseCleanerConverter extends Converter {
23  
24      private static final Set<String> POS_TAGS = ImmutableSet.of("a", "adv", "art", "c", "intj",
25              "n", "num", "prep", "scon", "v");
26  
27      private static final ValueFactory VF = Statements.VALUE_FACTORY;
28  
29      private static final Ordering<Value> ORDERING = Ordering.from(Statements.valueComparator());
30  
31      private static final Ordering<URI> MICROFRAME_ORDERING = new Ordering<URI>() {
32  
33          @Override
34          public int compare(final URI left, final URI right) {
35              final String s1 = left.stringValue();
36              final String s2 = right.stringValue();
37              final boolean wn1 = s1.contains("-wn_");
38              final boolean wn2 = s2.contains("-wn_");
39              if (wn1 && wn2) {
40                  return s1.compareTo(s2);
41              } else if (wn1 || wn2) {
42                  return wn1 ? 1 : -1;
43              }
44              final String t1 = s1.substring(s1.lastIndexOf('.') + 1);
45              final String t2 = s2.substring(s2.lastIndexOf('.') + 1);
46              if (!t1.equals(t2)) {
47                  for (final String t : new String[] { "v", "n", "a", "adv", "c", "scon", "art",
48                          "intj" }) {
49                      if (t1.endsWith(t)) {
50                          return -1;
51                      } else if (t2.endsWith(t)) {
52                          return 1;
53                      }
54                  }
55              }
56              int result = s1.length() - s2.length();
57              if (result == 0) {
58                  result = s1.compareTo(s2);
59              }
60              return result;
61          }
62  
63      };
64  
65      public FramebaseCleanerConverter(final File path, final RDFHandler sink, final Properties properties,
66              Map<String, URI> wnInfo) {
67          super(path, properties.getProperty("source"), sink, properties, properties.getProperty("language"), wnInfo);
68      }
69  
70      @Override
71      public void convert() throws IOException, RDFHandlerException {
72  
73          // Identify schema files (RDF) and synset mapping files (.txt) in the source folder
74          final List<String> synsetFiles = Lists.newArrayList();
75          final List<String> schemaFiles = Lists.newArrayList();
76          for (final File file : Files.fileTreeTraverser().preOrderTraversal(this.path)) {
77              if (Rio.getParserFormatForFileName(file.getName()) != null) {
78                  schemaFiles.add(file.getAbsolutePath());
79              } else if (file.getName().endsWith(".txt")) {
80                  synsetFiles.add(file.getAbsolutePath());
81              }
82          }
83  
84          // Read the input once
85          final RDFSource source = RDFSources.read(true, false, null, null,
86                  schemaFiles.toArray(new String[schemaFiles.size()]));
87          final RDFProcessor p1 = RDFProcessors.rdfs(source, SESAME.NIL, true, false);
88          final RDFProcessor p2 = new RDFProcessor() {
89  
90              @Override
91              public RDFHandler wrap(final RDFHandler handler) {
92                  return new Handler(handler);
93              }
94  
95          };
96          final RDFProcessor p = RDFProcessors.sequence(p1, p2);
97          p.apply(RDFSources.NIL, this.defaultSink, 1);
98      }
99  
100     private static class Handler extends AbstractRDFHandlerWrapper {
101 
102         private Map<URI, URIInfo> uriMap;
103 
104         Handler(final RDFHandler handler) {
105             super(handler);
106         }
107 
108         @Override
109         public void startRDF() throws RDFHandlerException {
110             super.startRDF();
111             this.uriMap = Maps.newHashMap();
112         }
113 
114         @Override
115         public void handleComment(final String comment) throws RDFHandlerException {
116             // ignore
117         }
118 
119         @Override
120         public void handleNamespace(final String prefix, final String uri)
121                 throws RDFHandlerException {
122             // ignore
123         }
124 
125         @Override
126         public synchronized void handleStatement(final Statement stmt) throws RDFHandlerException {
127 
128             final Resource s = stmt.getSubject();
129             final URI p = stmt.getPredicate();
130             final Value o = stmt.getObject();
131 
132             if (p.equals(OWL.EQUIVALENTCLASS) && s instanceof URI && o instanceof URI) {
133                 final URIInfo si = getURIInfo((URI) s);
134                 final URIInfo so = getURIInfo((URI) o);
135                 if (si != so) {
136                     si.merge(so);
137                     for (final URI alias : si.getAliases()) {
138                         this.uriMap.put(alias, si);
139                     }
140                 }
141             } else if (s instanceof URI) {
142                 final URIInfo si = getURIInfo((URI) s);
143                 si.update((URI) s, p, o);
144             }
145         }
146 
147         @Override
148         public void endRDF() throws RDFHandlerException {
149 
150             for (final URI uri : new URI[] { DCTERMS.TYPE, FB.INHERITS_FROM, FB.IS_PERSPECTIVE_OF,
151                     RDFS.LABEL, RDFS.COMMENT }) {
152                 this.handler.handleStatement(VF.createStatement(uri, RDF.TYPE,
153                         OWL.ANNOTATIONPROPERTY));
154             }
155             this.handler.handleStatement(VF.createStatement(ONTOLEX.IS_DENOTED_BY, RDF.TYPE,
156                     OWL.OBJECTPROPERTY));
157 
158             for (final URIInfo info : Ordering.natural().sortedCopy(
159                     ImmutableSet.copyOf(this.uriMap.values()))) {
160                 info.emit(this.handler, this.uriMap);
161             }
162 
163             super.endRDF();
164         }
165 
166         private URIInfo getURIInfo(final URI uri) {
167             URIInfo info = this.uriMap.get(uri);
168             if (info == null) {
169                 info = new URIInfo(uri);
170                 this.uriMap.put(uri, info);
171             }
172             return info;
173         }
174 
175     }
176 
177     private static final class URIInfo implements Comparable<URIInfo> {
178 
179         private URI uri;
180 
181         private Set<URI> aliases;
182 
183         private boolean isFrame;
184 
185         private boolean isMicroframe;
186 
187         private boolean isFE;
188 
189         private Set<String> labels;
190 
191         private Set<String> comments;
192 
193         private Set<URI> synsets;
194 
195         private Set<URI> inheritsFrom;
196 
197         private Set<URI> perspectiveOf;
198 
199         private Set<URI> parents;
200 
201         private Set<URI> domains;
202 
203         private Set<URI> ranges;
204 
205         public URIInfo(final URI uri) {
206             this.uri = uri;
207             this.aliases = ImmutableSet.of(uri);
208             this.isFrame = false;
209             this.isMicroframe = false;
210             this.isFE = false;
211             this.labels = ImmutableSet.of();
212             this.comments = ImmutableSet.of();
213             this.synsets = ImmutableSet.of();
214             this.inheritsFrom = ImmutableSet.of();
215             this.perspectiveOf = ImmutableSet.of();
216             this.parents = ImmutableSet.of();
217             this.domains = ImmutableSet.of();
218             this.ranges = ImmutableSet.of();
219         }
220 
221         public Set<URI> getAliases() {
222             return this.aliases;
223         }
224 
225         public void merge(final URIInfo info) {
226 
227             this.uri = MICROFRAME_ORDERING.min(this.uri, info.uri);
228             this.aliases = setAdd(this.aliases, info.aliases);
229             this.isFrame |= info.isFrame;
230             this.isMicroframe |= info.isMicroframe;
231             this.isFE |= info.isFE;
232             this.labels = setAdd(this.labels, info.labels);
233             this.comments = setAdd(this.comments, info.comments);
234             this.synsets = setAdd(this.synsets, info.synsets);
235             this.inheritsFrom = setAdd(this.inheritsFrom, info.inheritsFrom);
236             this.perspectiveOf = setAdd(this.perspectiveOf, info.perspectiveOf);
237             this.parents = setAdd(this.parents, info.parents);
238             this.domains = setAdd(this.domains, info.domains);
239             this.ranges = setAdd(this.ranges, info.ranges);
240         }
241 
242         public void update(final URI s, final URI p, final Value o) {
243 
244             final boolean isSynsetMicroframe = s.stringValue().contains("-wn_");
245 
246             if (p.equals(RDFS.LABEL)) {
247                 if (!isSynsetMicroframe) {
248                     this.labels = setAdd(this.labels, ((Literal) o).getLabel());
249                 }
250 
251             } else if (p.equals(RDFS.COMMENT)) {
252                 if (!isSynsetMicroframe) {
253                     this.comments = setAdd(this.comments, ((Literal) o).getLabel());
254                 }
255 
256             } else if (p.equals(FB.HAS_SYNSET_NUMBER)) {
257                 // Broken in FrameBase release
258                 // final String l = ((Literal) o).getLabel();
259                 // final String str = s.stringValue();
260                 // final int index = str.indexOf(l) + l.length() - 8;
261                 // final char pos = str.charAt(index - 2);
262                 // this.synsets = setAdd(this.synsets,
263                 //      VF.createURI("http://wordnet-rdf.princeton.edu/wn30/" + l + "-" + pos));
264 
265             } else if (p.equals(RDFS.DOMAIN)) {
266                 this.domains = setAdd(this.domains, toURI(o));
267 
268             } else if (p.equals(RDFS.RANGE)) {
269                 this.ranges = setAdd(this.ranges, toURI(o));
270 
271             } else if (p.equals(RDFS.SUBCLASSOF) || p.equals(RDFS.SUBPROPERTYOF)) {
272                 this.parents = setAdd(this.parents, toURI(o));
273 
274             } else if (p.equals(FB.INHERITS_FROM)) {
275                 this.inheritsFrom = setAdd(this.inheritsFrom, toURI(o));
276 
277             } else if (p.equals(FB.IS_PERSPECTIVE_OF)) {
278                 this.perspectiveOf = setAdd(this.perspectiveOf, toURI(o));
279 
280             } else if (p.equals(RDF.TYPE)) {
281                 if (o.equals(FB.FRAME)) {
282                     this.isFrame = true;
283                 } else if (o.equals(FB.MICROFRAME)) {
284                     this.isMicroframe = true;
285                 } else if (o.equals(FB.FRAME_ELEMENT_PROPERTY)) {
286                     this.isFE = true;
287                 }
288             }
289         }
290 
291         public void emit(final RDFHandler handler, final Map<URI, URIInfo> uriMap)
292                 throws RDFHandlerException {
293 
294             if (this.isMicroframe) {
295                 emit(handler, this.uri, RDF.TYPE, OWL.CLASS);
296                 emit(handler, this.uri, DCTERMS.TYPE, FB.MICROFRAME);
297                 emit(handler, this.uri, DCTERMS.TYPE, FB.FRAME);
298             } else if (this.isFrame) {
299                 emit(handler, this.uri, RDF.TYPE, OWL.CLASS);
300                 emit(handler, this.uri, DCTERMS.TYPE, FB.FRAME);
301             } else if (this.isFE) {
302                 emit(handler, this.uri, RDF.TYPE, OWL.OBJECTPROPERTY);
303                 emit(handler, this.uri, DCTERMS.TYPE, FB.FRAME_ELEMENT_PROPERTY);
304             } else {
305                 return;
306             }
307 
308             if (!this.labels.isEmpty()) {
309                 final Literal l = VF.createLiteral(
310                         Joiner.on(" / ").join(Ordering.natural().sortedCopy(this.labels)), "en");
311                 emit(handler, this.uri, RDFS.LABEL, l);
312             }
313 
314             if (!this.comments.isEmpty()) {
315                 final Literal l = VF.createLiteral(
316                         Joiner.on("\n").join(Ordering.natural().sortedCopy(this.comments)), "en");
317                 emit(handler, this.uri, RDFS.COMMENT, l);
318             }
319 
320             for (final URI uri : ORDERING.sortedCopy(this.synsets)) {
321                 emit(handler, this.uri, ONTOLEX.CONCEPT, uri);
322             }
323 
324             for (final URI uri : filter(this.inheritsFrom, uriMap, false)) {
325                 emit(handler, this.uri, FB.INHERITS_FROM, uri);
326             }
327 
328             for (final URI uri : filter(this.perspectiveOf, uriMap, false)) {
329                 emit(handler, this.uri, FB.IS_PERSPECTIVE_OF, uri);
330             }
331 
332             for (final URI uri : filter(Sets.difference(this.parents, this.aliases), uriMap, true)) {
333                 emit(handler, this.uri, this.isFE ? RDFS.SUBPROPERTYOF : RDFS.SUBCLASSOF, uri);
334             }
335 
336             for (final URI uri : filter(this.domains, uriMap, true)) {
337                 emit(handler, this.uri, RDFS.DOMAIN, uri);
338             }
339 
340             for (final URI uri : filter(this.ranges, uriMap, true)) {
341                 emit(handler, this.uri, RDFS.RANGE, uri);
342             }
343 
344             if (this.isMicroframe) {
345                 for (final URI uri : this.aliases) {
346                     URI entry = null;
347                     final String uriStr = uri.stringValue();
348                     final int index = uriStr.lastIndexOf('.');
349                     final String pos = uriStr.substring(index + 1);
350                     if (POS_TAGS.contains(pos)) {
351                         for (final URI parent : uriMap.get(uri).inheritsFrom) {
352                             final String parentStr = parent.stringValue();
353                             if (uriStr.startsWith(parentStr)) {
354                                 final String form = uriStr
355                                         .substring(parentStr.length() + 1, index);
356                                 entry = VF.createURI(PM.NAMESPACE, pos + "-" + form);
357                             }
358                         }
359                         Preconditions.checkArgument(entry != null, uriStr);
360                         emit(handler, this.uri, ONTOLEX.IS_DENOTED_BY, entry);
361                     }
362                 }
363             }
364         }
365 
366         @Override
367         public int compareTo(final URIInfo other) {
368             if (this.isFE && other.isFrame) {
369                 return 1;
370             } else if (this.isFrame && other.isFE) {
371                 return -1;
372             } else {
373                 return ORDERING.compare(this.uri, other.uri);
374             }
375         }
376 
377         private void emit(final RDFHandler handler, final Resource s, final URI p, final Value o)
378                 throws RDFHandlerException {
379             handler.handleStatement(Statements.VALUE_FACTORY.createStatement(s, p, o));
380         }
381 
382         private static List<URI> filter(final Iterable<URI> uris, final Map<URI, URIInfo> uriMap,
383                 final boolean removeParents) {
384             final Set<URI> rewrittenURIs = Sets.newHashSet();
385             for (final URI uri : uris) {
386                 final URIInfo info = uriMap.get(uri);
387                 if (info != null) {
388                     rewrittenURIs.add(info.uri);
389                 }
390             }
391             if (removeParents) {
392                 final Set<URI> parents = Sets.newHashSet();
393                 for (final URI uri : rewrittenURIs) {
394                     final URIInfo i = uriMap.get(uri);
395                     for (final URI u : i.parents) {
396                         if (!i.aliases.contains(u)) {
397                             parents.add(u);
398                         }
399                     }
400                 }
401                 rewrittenURIs.removeAll(parents);
402             }
403             return ORDERING.sortedCopy(rewrittenURIs);
404         }
405 
406         private static <T> Set<T> setAdd(Set<T> set, final T element) {
407             if (!(set instanceof HashSet)) {
408                 set = Sets.newHashSet();
409             }
410             set.add(element);
411             return set;
412         }
413 
414         private static <T> Set<T> setAdd(Set<T> set, final Iterable<T> elements) {
415             if (Iterables.isEmpty(elements)) {
416                 return set;
417             }
418             if (!(set instanceof HashSet)) {
419                 set = Sets.newHashSet(set);
420             }
421             Iterables.addAll(set, elements);
422             return set;
423         }
424 
425         private static URI toURI(final Value value) {
426             if (value instanceof URI) {
427                 return (URI) value;
428             }
429             if (value instanceof Literal) {
430                 final String s = ((Literal) value).getLabel();
431                 if (s.startsWith("http://")) {
432                     return VF.createURI(s.trim());
433                 }
434             }
435             throw new IllegalArgumentException("Not a valid URI: " + value);
436         }
437 
438     }
439 
440     @Override protected URI getPosURI(String textualPOS) {
441         return null;
442     }
443 }