1 package eu.fbk.dkm.premon.premonitor;
2
3 import com.google.common.base.Joiner;
4 import com.google.common.base.Preconditions;
5 import com.google.common.collect.*;
6 import com.google.common.io.Files;
7 import eu.fbk.dkm.premon.vocab.FB;
8 import eu.fbk.dkm.premon.vocab.ONTOLEX;
9 import eu.fbk.dkm.premon.vocab.PM;
10 import eu.fbk.rdfpro.*;
11 import eu.fbk.rdfpro.util.Statements;
12 import org.openrdf.model.*;
13 import org.openrdf.model.vocabulary.*;
14 import org.openrdf.rio.RDFHandler;
15 import org.openrdf.rio.RDFHandlerException;
16 import org.openrdf.rio.Rio;
17
18 import java.io.File;
19 import java.io.IOException;
20 import java.util.*;
21
22 public class FramebaseCleanerConverter extends Converter {
23
24 private static final Set<String> POS_TAGS = ImmutableSet.of("a", "adv", "art", "c", "intj",
25 "n", "num", "prep", "scon", "v");
26
27 private static final ValueFactory VF = Statements.VALUE_FACTORY;
28
29 private static final Ordering<Value> ORDERING = Ordering.from(Statements.valueComparator());
30
31 private static final Ordering<URI> MICROFRAME_ORDERING = new Ordering<URI>() {
32
33 @Override
34 public int compare(final URI left, final URI right) {
35 final String s1 = left.stringValue();
36 final String s2 = right.stringValue();
37 final boolean wn1 = s1.contains("-wn_");
38 final boolean wn2 = s2.contains("-wn_");
39 if (wn1 && wn2) {
40 return s1.compareTo(s2);
41 } else if (wn1 || wn2) {
42 return wn1 ? 1 : -1;
43 }
44 final String t1 = s1.substring(s1.lastIndexOf('.') + 1);
45 final String t2 = s2.substring(s2.lastIndexOf('.') + 1);
46 if (!t1.equals(t2)) {
47 for (final String t : new String[] { "v", "n", "a", "adv", "c", "scon", "art",
48 "intj" }) {
49 if (t1.endsWith(t)) {
50 return -1;
51 } else if (t2.endsWith(t)) {
52 return 1;
53 }
54 }
55 }
56 int result = s1.length() - s2.length();
57 if (result == 0) {
58 result = s1.compareTo(s2);
59 }
60 return result;
61 }
62
63 };
64
/**
 * Creates the converter by delegating to the {@code Converter} constructor. The second and
 * fifth superclass arguments are read from the {@code "source"} and {@code "language"}
 * entries of {@code properties}; every other argument is forwarded unchanged.
 *
 * @param path
 *            the path forwarded to the superclass
 * @param sink
 *            the RDF handler forwarded to the superclass
 * @param properties
 *            the configuration, also queried for the "source" and "language" keys
 * @param wnInfo
 *            the string-to-URI map forwarded to the superclass
 */
public FramebaseCleanerConverter(
        final File path,
        final RDFHandler sink,
        final Properties properties,
        Map<String, URI> wnInfo) {
    super(path,
            properties.getProperty("source"),
            sink,
            properties,
            properties.getProperty("language"),
            wnInfo);
}
69
70 @Override
71 public void convert() throws IOException, RDFHandlerException {
72
73
74 final List<String> synsetFiles = Lists.newArrayList();
75 final List<String> schemaFiles = Lists.newArrayList();
76 for (final File file : Files.fileTreeTraverser().preOrderTraversal(this.path)) {
77 if (Rio.getParserFormatForFileName(file.getName()) != null) {
78 schemaFiles.add(file.getAbsolutePath());
79 } else if (file.getName().endsWith(".txt")) {
80 synsetFiles.add(file.getAbsolutePath());
81 }
82 }
83
84
85 final RDFSource source = RDFSources.read(true, false, null, null,
86 schemaFiles.toArray(new String[schemaFiles.size()]));
87 final RDFProcessor p1 = RDFProcessors.rdfs(source, SESAME.NIL, true, false);
88 final RDFProcessor p2 = new RDFProcessor() {
89
90 @Override
91 public RDFHandler wrap(final RDFHandler handler) {
92 return new Handler(handler);
93 }
94
95 };
96 final RDFProcessor p = RDFProcessors.sequence(p1, p2);
97 p.apply(RDFSources.NIL, this.defaultSink, 1);
98 }
99
100 private static class Handler extends AbstractRDFHandlerWrapper {
101
102 private Map<URI, URIInfo> uriMap;
103
104 Handler(final RDFHandler handler) {
105 super(handler);
106 }
107
108 @Override
109 public void startRDF() throws RDFHandlerException {
110 super.startRDF();
111 this.uriMap = Maps.newHashMap();
112 }
113
114 @Override
115 public void handleComment(final String comment) throws RDFHandlerException {
116
117 }
118
119 @Override
120 public void handleNamespace(final String prefix, final String uri)
121 throws RDFHandlerException {
122
123 }
124
125 @Override
126 public synchronized void handleStatement(final Statement stmt) throws RDFHandlerException {
127
128 final Resource s = stmt.getSubject();
129 final URI p = stmt.getPredicate();
130 final Value o = stmt.getObject();
131
132 if (p.equals(OWL.EQUIVALENTCLASS) && s instanceof URI && o instanceof URI) {
133 final URIInfo si = getURIInfo((URI) s);
134 final URIInfo so = getURIInfo((URI) o);
135 if (si != so) {
136 si.merge(so);
137 for (final URI alias : si.getAliases()) {
138 this.uriMap.put(alias, si);
139 }
140 }
141 } else if (s instanceof URI) {
142 final URIInfo si = getURIInfo((URI) s);
143 si.update((URI) s, p, o);
144 }
145 }
146
147 @Override
148 public void endRDF() throws RDFHandlerException {
149
150 for (final URI uri : new URI[] { DCTERMS.TYPE, FB.INHERITS_FROM, FB.IS_PERSPECTIVE_OF,
151 RDFS.LABEL, RDFS.COMMENT }) {
152 this.handler.handleStatement(VF.createStatement(uri, RDF.TYPE,
153 OWL.ANNOTATIONPROPERTY));
154 }
155 this.handler.handleStatement(VF.createStatement(ONTOLEX.IS_DENOTED_BY, RDF.TYPE,
156 OWL.OBJECTPROPERTY));
157
158 for (final URIInfo info : Ordering.natural().sortedCopy(
159 ImmutableSet.copyOf(this.uriMap.values()))) {
160 info.emit(this.handler, this.uriMap);
161 }
162
163 super.endRDF();
164 }
165
166 private URIInfo getURIInfo(final URI uri) {
167 URIInfo info = this.uriMap.get(uri);
168 if (info == null) {
169 info = new URIInfo(uri);
170 this.uriMap.put(uri, info);
171 }
172 return info;
173 }
174
175 }
176
177 private static final class URIInfo implements Comparable<URIInfo> {
178
179 private URI uri;
180
181 private Set<URI> aliases;
182
183 private boolean isFrame;
184
185 private boolean isMicroframe;
186
187 private boolean isFE;
188
189 private Set<String> labels;
190
191 private Set<String> comments;
192
193 private Set<URI> synsets;
194
195 private Set<URI> inheritsFrom;
196
197 private Set<URI> perspectiveOf;
198
199 private Set<URI> parents;
200
201 private Set<URI> domains;
202
203 private Set<URI> ranges;
204
205 public URIInfo(final URI uri) {
206 this.uri = uri;
207 this.aliases = ImmutableSet.of(uri);
208 this.isFrame = false;
209 this.isMicroframe = false;
210 this.isFE = false;
211 this.labels = ImmutableSet.of();
212 this.comments = ImmutableSet.of();
213 this.synsets = ImmutableSet.of();
214 this.inheritsFrom = ImmutableSet.of();
215 this.perspectiveOf = ImmutableSet.of();
216 this.parents = ImmutableSet.of();
217 this.domains = ImmutableSet.of();
218 this.ranges = ImmutableSet.of();
219 }
220
221 public Set<URI> getAliases() {
222 return this.aliases;
223 }
224
225 public void merge(final URIInfo info) {
226
227 this.uri = MICROFRAME_ORDERING.min(this.uri, info.uri);
228 this.aliases = setAdd(this.aliases, info.aliases);
229 this.isFrame |= info.isFrame;
230 this.isMicroframe |= info.isMicroframe;
231 this.isFE |= info.isFE;
232 this.labels = setAdd(this.labels, info.labels);
233 this.comments = setAdd(this.comments, info.comments);
234 this.synsets = setAdd(this.synsets, info.synsets);
235 this.inheritsFrom = setAdd(this.inheritsFrom, info.inheritsFrom);
236 this.perspectiveOf = setAdd(this.perspectiveOf, info.perspectiveOf);
237 this.parents = setAdd(this.parents, info.parents);
238 this.domains = setAdd(this.domains, info.domains);
239 this.ranges = setAdd(this.ranges, info.ranges);
240 }
241
242 public void update(final URI s, final URI p, final Value o) {
243
244 final boolean isSynsetMicroframe = s.stringValue().contains("-wn_");
245
246 if (p.equals(RDFS.LABEL)) {
247 if (!isSynsetMicroframe) {
248 this.labels = setAdd(this.labels, ((Literal) o).getLabel());
249 }
250
251 } else if (p.equals(RDFS.COMMENT)) {
252 if (!isSynsetMicroframe) {
253 this.comments = setAdd(this.comments, ((Literal) o).getLabel());
254 }
255
256 } else if (p.equals(FB.HAS_SYNSET_NUMBER)) {
257
258
259
260
261
262
263
264
265 } else if (p.equals(RDFS.DOMAIN)) {
266 this.domains = setAdd(this.domains, toURI(o));
267
268 } else if (p.equals(RDFS.RANGE)) {
269 this.ranges = setAdd(this.ranges, toURI(o));
270
271 } else if (p.equals(RDFS.SUBCLASSOF) || p.equals(RDFS.SUBPROPERTYOF)) {
272 this.parents = setAdd(this.parents, toURI(o));
273
274 } else if (p.equals(FB.INHERITS_FROM)) {
275 this.inheritsFrom = setAdd(this.inheritsFrom, toURI(o));
276
277 } else if (p.equals(FB.IS_PERSPECTIVE_OF)) {
278 this.perspectiveOf = setAdd(this.perspectiveOf, toURI(o));
279
280 } else if (p.equals(RDF.TYPE)) {
281 if (o.equals(FB.FRAME)) {
282 this.isFrame = true;
283 } else if (o.equals(FB.MICROFRAME)) {
284 this.isMicroframe = true;
285 } else if (o.equals(FB.FRAME_ELEMENT_PROPERTY)) {
286 this.isFE = true;
287 }
288 }
289 }
290
291 public void emit(final RDFHandler handler, final Map<URI, URIInfo> uriMap)
292 throws RDFHandlerException {
293
294 if (this.isMicroframe) {
295 emit(handler, this.uri, RDF.TYPE, OWL.CLASS);
296 emit(handler, this.uri, DCTERMS.TYPE, FB.MICROFRAME);
297 emit(handler, this.uri, DCTERMS.TYPE, FB.FRAME);
298 } else if (this.isFrame) {
299 emit(handler, this.uri, RDF.TYPE, OWL.CLASS);
300 emit(handler, this.uri, DCTERMS.TYPE, FB.FRAME);
301 } else if (this.isFE) {
302 emit(handler, this.uri, RDF.TYPE, OWL.OBJECTPROPERTY);
303 emit(handler, this.uri, DCTERMS.TYPE, FB.FRAME_ELEMENT_PROPERTY);
304 } else {
305 return;
306 }
307
308 if (!this.labels.isEmpty()) {
309 final Literal l = VF.createLiteral(
310 Joiner.on(" / ").join(Ordering.natural().sortedCopy(this.labels)), "en");
311 emit(handler, this.uri, RDFS.LABEL, l);
312 }
313
314 if (!this.comments.isEmpty()) {
315 final Literal l = VF.createLiteral(
316 Joiner.on("\n").join(Ordering.natural().sortedCopy(this.comments)), "en");
317 emit(handler, this.uri, RDFS.COMMENT, l);
318 }
319
320 for (final URI uri : ORDERING.sortedCopy(this.synsets)) {
321 emit(handler, this.uri, ONTOLEX.CONCEPT, uri);
322 }
323
324 for (final URI uri : filter(this.inheritsFrom, uriMap, false)) {
325 emit(handler, this.uri, FB.INHERITS_FROM, uri);
326 }
327
328 for (final URI uri : filter(this.perspectiveOf, uriMap, false)) {
329 emit(handler, this.uri, FB.IS_PERSPECTIVE_OF, uri);
330 }
331
332 for (final URI uri : filter(Sets.difference(this.parents, this.aliases), uriMap, true)) {
333 emit(handler, this.uri, this.isFE ? RDFS.SUBPROPERTYOF : RDFS.SUBCLASSOF, uri);
334 }
335
336 for (final URI uri : filter(this.domains, uriMap, true)) {
337 emit(handler, this.uri, RDFS.DOMAIN, uri);
338 }
339
340 for (final URI uri : filter(this.ranges, uriMap, true)) {
341 emit(handler, this.uri, RDFS.RANGE, uri);
342 }
343
344 if (this.isMicroframe) {
345 for (final URI uri : this.aliases) {
346 URI entry = null;
347 final String uriStr = uri.stringValue();
348 final int index = uriStr.lastIndexOf('.');
349 final String pos = uriStr.substring(index + 1);
350 if (POS_TAGS.contains(pos)) {
351 for (final URI parent : uriMap.get(uri).inheritsFrom) {
352 final String parentStr = parent.stringValue();
353 if (uriStr.startsWith(parentStr)) {
354 final String form = uriStr
355 .substring(parentStr.length() + 1, index);
356 entry = VF.createURI(PM.NAMESPACE, pos + "-" + form);
357 }
358 }
359 Preconditions.checkArgument(entry != null, uriStr);
360 emit(handler, this.uri, ONTOLEX.IS_DENOTED_BY, entry);
361 }
362 }
363 }
364 }
365
366 @Override
367 public int compareTo(final URIInfo other) {
368 if (this.isFE && other.isFrame) {
369 return 1;
370 } else if (this.isFrame && other.isFE) {
371 return -1;
372 } else {
373 return ORDERING.compare(this.uri, other.uri);
374 }
375 }
376
377 private void emit(final RDFHandler handler, final Resource s, final URI p, final Value o)
378 throws RDFHandlerException {
379 handler.handleStatement(Statements.VALUE_FACTORY.createStatement(s, p, o));
380 }
381
382 private static List<URI> filter(final Iterable<URI> uris, final Map<URI, URIInfo> uriMap,
383 final boolean removeParents) {
384 final Set<URI> rewrittenURIs = Sets.newHashSet();
385 for (final URI uri : uris) {
386 final URIInfo info = uriMap.get(uri);
387 if (info != null) {
388 rewrittenURIs.add(info.uri);
389 }
390 }
391 if (removeParents) {
392 final Set<URI> parents = Sets.newHashSet();
393 for (final URI uri : rewrittenURIs) {
394 final URIInfo i = uriMap.get(uri);
395 for (final URI u : i.parents) {
396 if (!i.aliases.contains(u)) {
397 parents.add(u);
398 }
399 }
400 }
401 rewrittenURIs.removeAll(parents);
402 }
403 return ORDERING.sortedCopy(rewrittenURIs);
404 }
405
406 private static <T> Set<T> setAdd(Set<T> set, final T element) {
407 if (!(set instanceof HashSet)) {
408 set = Sets.newHashSet();
409 }
410 set.add(element);
411 return set;
412 }
413
414 private static <T> Set<T> setAdd(Set<T> set, final Iterable<T> elements) {
415 if (Iterables.isEmpty(elements)) {
416 return set;
417 }
418 if (!(set instanceof HashSet)) {
419 set = Sets.newHashSet(set);
420 }
421 Iterables.addAll(set, elements);
422 return set;
423 }
424
425 private static URI toURI(final Value value) {
426 if (value instanceof URI) {
427 return (URI) value;
428 }
429 if (value instanceof Literal) {
430 final String s = ((Literal) value).getLabel();
431 if (s.startsWith("http://")) {
432 return VF.createURI(s.trim());
433 }
434 }
435 throw new IllegalArgumentException("Not a valid URI: " + value);
436 }
437
438 }
439
440 @Override protected URI getPosURI(String textualPOS) {
441 return null;
442 }
443 }