1 package eu.fbk.dkm.premon.premonitor;
2
3 import java.io.File;
4 import java.io.FileInputStream;
5 import java.lang.reflect.Constructor;
6 import java.net.URL;
7 import java.nio.file.Files;
8 import java.util.Collection;
9 import java.util.HashMap;
10 import java.util.List;
11 import java.util.Map;
12 import java.util.Properties;
13 import java.util.Set;
14 import java.util.concurrent.atomic.AtomicInteger;
15 import java.util.regex.Matcher;
16 import java.util.regex.Pattern;
17 import java.util.stream.Collectors;
18
19 import javax.annotation.Nullable;
20
21 import com.google.common.base.Charsets;
22 import com.google.common.base.Joiner;
23 import com.google.common.base.MoreObjects;
24 import com.google.common.base.Preconditions;
25 import com.google.common.collect.HashBasedTable;
26 import com.google.common.collect.HashMultimap;
27 import com.google.common.collect.ImmutableList;
28 import com.google.common.collect.ImmutableMap;
29 import com.google.common.collect.ImmutableSet;
30 import com.google.common.collect.Iterables;
31 import com.google.common.collect.Lists;
32 import com.google.common.collect.Maps;
33 import com.google.common.collect.Multimap;
34 import com.google.common.collect.Multimaps;
35 import com.google.common.collect.Ordering;
36 import com.google.common.collect.Sets;
37 import com.google.common.collect.Table;
38 import com.google.common.collect.Table.Cell;
39 import com.google.common.io.Resources;
40
41 import org.openrdf.model.BNode;
42 import org.openrdf.model.Namespace;
43 import org.openrdf.model.Resource;
44 import org.openrdf.model.Statement;
45 import org.openrdf.model.URI;
46 import org.openrdf.model.Value;
47 import org.openrdf.model.impl.ContextStatementImpl;
48 import org.openrdf.model.impl.URIImpl;
49 import org.openrdf.model.vocabulary.DCTERMS;
50 import org.openrdf.model.vocabulary.OWL;
51 import org.openrdf.model.vocabulary.RDF;
52 import org.openrdf.model.vocabulary.RDFS;
53 import org.openrdf.rio.RDFHandler;
54 import org.openrdf.rio.RDFHandlerException;
55 import org.slf4j.Logger;
56 import org.slf4j.LoggerFactory;
57
58 import eu.fbk.dkm.premon.util.ProcessorUndoRDFS;
59 import eu.fbk.dkm.premon.vocab.DECOMP;
60 import eu.fbk.dkm.premon.vocab.FB;
61 import eu.fbk.dkm.premon.vocab.LEXINFO;
62 import eu.fbk.dkm.premon.vocab.NIF;
63 import eu.fbk.dkm.premon.vocab.ONTOLEX;
64 import eu.fbk.dkm.premon.vocab.PM;
65 import eu.fbk.dkm.premon.vocab.PMO;
66 import eu.fbk.dkm.premon.vocab.PMONB;
67 import eu.fbk.dkm.premon.vocab.PMOPB;
68 import eu.fbk.dkm.utils.CommandLine;
69 import eu.fbk.rdfpro.AbstractRDFHandler;
70 import eu.fbk.rdfpro.RDFHandlers;
71 import eu.fbk.rdfpro.RDFProcessor;
72 import eu.fbk.rdfpro.RDFProcessors;
73 import eu.fbk.rdfpro.RDFSource;
74 import eu.fbk.rdfpro.RDFSources;
75 import eu.fbk.rdfpro.RuleEngine;
76 import eu.fbk.rdfpro.Ruleset;
77 import eu.fbk.rdfpro.SetOperator;
78 import eu.fbk.rdfpro.util.Hash;
79 import eu.fbk.rdfpro.util.IO;
80 import eu.fbk.rdfpro.util.QuadModel;
81 import eu.fbk.rdfpro.util.Statements;
82 import eu.fbk.rdfpro.util.Tracker;
83
84
85
86
87 public class Premonitor {
88
89 private static final String DEFAULT_PATH = ".";
90 private static final String DEFAULT_PROPERTIES_FILE = "premonitor.properties";
91 private static final String DEFAULT_OUTPUT_BASE = "output/premon";
92 private static final String DEFAULT_OUTPUT_FORMATS = "trig.gz,tql.gz,ttl.gz";
93 private static final String DEFAULT_WORDNET_FILE = "wordnet-3.1/wn31.nt.gz";
94
95 private static final Pattern PROPERTIES_RESOURCES_PATTERN = Pattern
96 .compile("^resource([0-9]+)\\.(.*)$");
97
98 private static final String WN_PREFIX = "http://wordnet-rdf.princeton.edu/wn31/";
99
100 private static final URI LEMON_LEXICAL_ENTRY = Statements.VALUE_FACTORY
101 .createURI("http://lemon-model.net/lemon#LexicalEntry");
102 private static final URI LEMON_REFERENCE = Statements.VALUE_FACTORY
103 .createURI("http://lemon-model.net/lemon#reference");
104 private static final URI WN_OLD_SENSE = Statements.VALUE_FACTORY
105 .createURI("http://wordnet-rdf.princeton.edu/ontology#old_sense_key");
106
107 private static final Logger LOGGER = LoggerFactory.getLogger(Premonitor.class);
108
109 public static void main(final String[] args) {
110
111 try {
112 final CommandLine cmd = CommandLine.parser().withName("./premonitor")
113 .withHeader("Transform linguistic resources into RDF")
114 .withOption("i", "input",
115 String.format("input folder (default %s)", DEFAULT_PATH), "FOLDER",
116 CommandLine.Type.DIRECTORY_EXISTING, true, false, false)
117 .withOption("b", "output-base", "Output base path/name (default 'premon')",
118 "PATH", CommandLine.Type.FILE, true, false, false)
119 .withOption("f", "output-formats",
120 "Comma-separated list of output formats (default 'tql.gz')", "FMTS",
121 CommandLine.Type.STRING, true, false, false)
122 .withOption("p", "properties",
123 String.format("Property file (default %s)", DEFAULT_PROPERTIES_FILE),
124 "FILE", CommandLine.Type.FILE, true, false, false)
125 .withOption("s", "single", "Extract single lemma (apply to all resources)",
126 "LEMMA", CommandLine.Type.STRING, true, false, false)
127 .withOption(null, "wordnet",
128 String.format("WordNet RDF triple file (default: %s)",
129 DEFAULT_WORDNET_FILE),
130 "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
131 .withOption(null, "wordnet-sensekeys", "WordNet senseKey mapping", "FILE",
132 CommandLine.Type.FILE_EXISTING, true, false, false)
133 .withOption("r", "omit-owl2rl", "Omit OWL2RL reasoning (faster)")
134 .withOption("x", "omit-stats", "Omit generation of statistics (faster)")
135 .withOption("m", "omit-filter-mappings",
136 "Omit filtering illegal mappings "
137 + "referring to non-existing conceptualizations (faster)")
138 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
139
140
141 File inputFolder = new File(DEFAULT_PATH);
142 if (cmd.hasOption("input")) {
143 inputFolder = cmd.getOptionValue("input", File.class);
144 }
145 File propertiesFile = new File(DEFAULT_PROPERTIES_FILE);
146 if (cmd.hasOption("properties")) {
147 propertiesFile = cmd.getOptionValue("properties", File.class);
148 }
149
150 System.setProperty("javax.xml.accessExternalDTD", "file");
151
152
153 final HashMap<String, URI> wnInfo = new HashMap<>();
154
155 final URL resource = ClassLoader.getSystemClassLoader()
156 .getResource("eu/fbk/dkm/premon/premonitor/wn30-senseKeys.tsv");
157 List<String> allLines = null;
158 if (resource != null) {
159 allLines = Resources.readLines(resource, Charsets.UTF_8);
160 }
161
162 if (cmd.hasOption("wordnet-sensekeys")) {
163 allLines = Files.readAllLines(
164 cmd.getOptionValue("wordnet-sensekeys", File.class).toPath());
165 }
166 if (allLines != null) {
167 for (String line : allLines) {
168 line = line.trim();
169 final String[] parts = line.split("\\s+");
170 if (parts.length >= 2) {
171 String senseKey = parts[0];
172 final String synsetID = parts[1];
173 senseKey = senseKey.replaceAll(":[^:]*:[^:]*$", "");
174 wnInfo.put(senseKey, Converter.createURI(WN_PREFIX, synsetID));
175 }
176 }
177 }
178
179 if (cmd.hasOption("wordnet")) {
180 final File wnRDF = cmd.getOptionValue("wordnet", File.class);
181 if (wnRDF != null && wnRDF.exists()) {
182 LOGGER.info("Loading WordNet");
183 final RDFSource source = RDFSources.read(true, true, null, null,
184 wnRDF.getAbsolutePath());
185 source.emit(new AbstractRDFHandler() {
186
187 @Override
188 public void handleStatement(final Statement statement)
189 throws RDFHandlerException {
190
191
192 if (statement.getPredicate().equals(RDF.TYPE)
193 && statement.getObject().equals(LEMON_LEXICAL_ENTRY)) {
194 if (statement.getSubject() instanceof URI) {
195 synchronized (wnInfo) {
196
197 wnInfo.put(statement.getSubject().stringValue(),
198 (URI) statement.getSubject());
199 }
200 }
201 }
202
203
204 if (statement.getPredicate().equals(LEMON_REFERENCE)) {
205 final Resource s = statement.getSubject();
206 final Value o = statement.getObject();
207 if (s instanceof URI && o instanceof URI) {
208 synchronized (wnInfo) {
209
210 final String name = s.stringValue();
211 final int start = name.lastIndexOf('/') + 1;
212 final int end = name.lastIndexOf('-',
213 name.indexOf('#', start));
214 final String lemma = name.substring(start, end)
215 .replace('+', '_');
216 final String key = o.stringValue() + "|" + lemma;
217 final URI oldURI = wnInfo.put(key, (URI) s);
218 Preconditions
219 .checkState(oldURI == null || oldURI.equals(s));
220 }
221 }
222 }
223 }
224 }, 1);
225
226 LOGGER.info("Loaded {} URIs", wnInfo.size());
227 }
228 }
229
230
231 final HashMap<Integer, Properties> multiProperties = new HashMap<>();
232
233 LOGGER.info("Loading properties file: {}", propertiesFile.getAbsolutePath());
234 if (propertiesFile.exists()) {
235 final Properties tmpProp = new Properties();
236 tmpProp.load(new FileInputStream(propertiesFile));
237
238 for (final Object key : tmpProp.keySet()) {
239 final Matcher m = PROPERTIES_RESOURCES_PATTERN.matcher((String) key);
240 if (m.find()) {
241 final Integer id = Integer.parseInt(m.group(1));
242 final String subProperty = m.group(2);
243
244 if (multiProperties.get(id) == null) {
245 multiProperties.put(id, new Properties());
246 }
247
248 multiProperties.get(id).setProperty(subProperty,
249 tmpProp.getProperty((String) key));
250 }
251 }
252 }
253
254 final Map<String, Map<URI, QuadModel>> models = new HashMap<>();
255 for (final Integer id : multiProperties.keySet()) {
256 final Properties properties = multiProperties.get(id);
257
258 final boolean active = properties.getProperty("active", "0").equals("1");
259 if (!active) {
260 LOGGER.info("Resource {} is not active", id);
261 continue;
262 }
263
264 final String source = properties.getProperty("source");
265 if (source == null || source.length() == 0) {
266 LOGGER.error("Resource {} has no source", id);
267 continue;
268 }
269
270 LOGGER.info("Processing {}", properties.getProperty("label"));
271
272
273 final String className = properties.getProperty("class");
274 if (className == null) {
275 LOGGER.error("Resource {} has no class", id);
276 continue;
277 }
278
279
280 String folderName = properties.getProperty("folder");
281 if (folderName == null) {
282 LOGGER.error("Resource {} has no folder", id);
283 continue;
284 }
285 if (!folderName.startsWith(File.separator)) {
286 folderName = inputFolder + File.separator + folderName;
287 }
288 final File folder = new File(folderName);
289 if (!folder.exists()) {
290 LOGGER.error("Folder {} does not exist", folderName);
291 continue;
292 }
293 if (!folder.isDirectory()) {
294 LOGGER.error("Folder {} is not a folder", folderName);
295 continue;
296 }
297
298 try {
299
300 final AtomicInteger numQuads = new AtomicInteger();
301 final Map<String, String> namespaces = Maps.newHashMap();
302 final Map<URI, QuadModel> graphModels = new HashMap<>();
303 models.put(source, graphModels);
304 final RDFHandler handler = new AbstractRDFHandler() {
305
306 @Override
307 public void handleNamespace(final String prefix, final String uri) {
308 namespaces.put(prefix, uri);
309 }
310
311 @Override
312 public synchronized void handleStatement(final Statement stmt) {
313 numQuads.incrementAndGet();
314 URI graph;
315 try {
316 graph = (URI) stmt.getContext();
317 } catch (final ClassCastException ex) {
318 LOGGER.warn("Unexpected non-URI graph: " + stmt.getContext());
319 return;
320 }
321 QuadModel graphModel = graphModels.get(graph);
322 if (graphModel == null) {
323 graphModel = QuadModel.create();
324 graphModels.put(graph, graphModel);
325 }
326 graphModel.add(stmt.getSubject(), stmt.getPredicate(),
327 stmt.getObject());
328 }
329
330 };
331
332
333 final Class<?> cls = Class.forName(className);
334 final Constructor<?> constructor = cls.getConstructor(File.class,
335 RDFHandler.class, Properties.class, Map.class);
336 final Object converter = constructor.newInstance(folder, handler, properties,
337 wnInfo);
338 if (converter instanceof Converter) {
339 ((Converter) converter).convert();
340 }
341
342
343 int numUniqueQuads = 0;
344 for (final QuadModel model : graphModels.values()) {
345 numUniqueQuads += model.size();
346 for (final Map.Entry<String, String> entry : namespaces.entrySet()) {
347 model.setNamespace(entry.getKey(), entry.getValue());
348 }
349 model.setNamespace(PM.PREFIX, PM.NAMESPACE);
350 model.setNamespace(PMO.PREFIX, PMO.NAMESPACE);
351 model.setNamespace(PMOPB.PREFIX, PMOPB.NAMESPACE);
352 model.setNamespace(PMONB.PREFIX, PMONB.NAMESPACE);
353 model.setNamespace(ONTOLEX.PREFIX, ONTOLEX.NAMESPACE);
354 model.setNamespace(DECOMP.PREFIX, DECOMP.NAMESPACE);
355 model.setNamespace(LEXINFO.PREFIX, LEXINFO.NAMESPACE);
356 model.setNamespace(FB.PREFIX, FB.NAMESPACE);
357 }
358
359
360 LOGGER.info("Extracted {} quads ({} before deduplication)", numUniqueQuads,
361 numQuads.get());
362
363 } catch (final ClassNotFoundException e) {
364
365 LOGGER.error("Class {} not found", className);
366 }
367 }
368
369 try {
370
371 final String base = cmd.getOptionValue("b", String.class, DEFAULT_OUTPUT_BASE);
372 final String[] formats = cmd
373 .getOptionValue("f", String.class, DEFAULT_OUTPUT_FORMATS).split(",");
374 for (int i = 0; i < formats.length; ++i) {
375 if (formats[i].charAt(0) == '.') {
376 formats[i] = formats[i].substring(1);
377 }
378 }
379
380
381 final boolean owl2rl = !cmd.hasOption("r");
382 final boolean statistics = !cmd.hasOption("x");
383 final boolean filterMappings = !cmd.hasOption("m");
384
385
386 emit(base, formats, models, owl2rl, statistics, filterMappings);
387
388 } catch (final Exception ex) {
389
390 throw new RDFHandlerException(
391 "IO error, some files might not have been properly saved ("
392 + ex.getMessage() + ")",
393 ex);
394 }
395
396 } catch (final Throwable ex) {
397 CommandLine.fail(ex);
398 }
399 }
400
401 private static void emit(final String base, final String[] formats,
402 final Map<String, Map<URI, QuadModel>> models, final boolean owl2rl,
403 final boolean statistics, final boolean filterMappings) throws RDFHandlerException {
404
405
406 final QuadModel tbox = QuadModel.create();
407 RDFSources
408 .read(false, true, null, null, "classpath:/eu/fbk/dkm/premon/premonitor/tbox.ttl")
409 .emit(RDFHandlers.wrap(tbox), 1);
410 final String semNS = "http://www.ontologydesignpatterns.org/cp/owl/semiotics.owl#";
411 final Set<URI> unwantedConcepts = ImmutableSet.of(RDFS.RESOURCE, NIF.URISCHEME,
412 NIF.RFC5147_STRING, NIF.CSTRING, new URIImpl(semNS + "InformationEntity"),
413 new URIImpl(semNS + "Expression"), new URIImpl(semNS + "Meaning"));
414 for (final Statement stmt : ImmutableList.copyOf(tbox)) {
415 final Resource s = stmt.getSubject();
416 final URI p = stmt.getPredicate();
417 final Value o = stmt.getObject();
418 if (unwantedConcepts.contains(s) || unwantedConcepts.contains(o)
419 || (s.equals(PMO.SEMANTIC_CLASS_MAPPING) || s.equals(PMO.SEMANTIC_ROLE_MAPPING)
420 || s.equals(PMO.CONCEPTUALIZATION_MAPPING))
421 && p.equals(RDFS.SUBCLASSOF) && o instanceof BNode) {
422 tbox.remove(stmt);
423 }
424 }
425 LOGGER.info("TBox loaded - {} quads", tbox.size());
426
427
428 final Ruleset tboxRuleset = Ruleset
429 .fromRDF("classpath:/eu/fbk/dkm/premon/premonitor/ruleset.ttl");
430 RuleEngine.create(tboxRuleset).eval(tbox);
431 LOGGER.info("TBox closed - {} quads", tbox.size());
432
433 if (owl2rl) {
434
435 final Ruleset aboxRuleset = tboxRuleset.getABoxRuleset(tbox);
436 final RuleEngine aboxEngine = RuleEngine.create(aboxRuleset);
437 LOGGER.info("ABox rule engine initialized - {}", aboxEngine);
438
439
440 for (final Map.Entry<String, Map<URI, QuadModel>> entry1 : models.entrySet()) {
441 for (final Map.Entry<URI, QuadModel> entry2 : entry1.getValue().entrySet()) {
442 final int sizeBefore = entry2.getValue().size();
443 aboxEngine.eval(entry2.getValue());
444 for (final Statement stmt : tbox) {
445 entry2.getValue().remove(stmt.getSubject(), stmt.getPredicate(),
446 stmt.getObject());
447 }
448 final int sizeAfter = entry2.getValue().size();
449 LOGGER.info("ABox closed for {}, graph {}: from {} to {} quads",
450 entry1.getKey(), entry2.getKey(), sizeBefore, sizeAfter);
451 }
452 }
453
454
455
456 for (final Map.Entry<String, Map<URI, QuadModel>> entry1 : models.entrySet()) {
457 final String source = entry1.getKey();
458 final Map<URI, QuadModel> sourceModels = entry1.getValue();
459 final QuadModel entriesModel = sourceModels.get(PM.ENTRIES);
460 for (final Map.Entry<URI, QuadModel> entry2 : sourceModels.entrySet()) {
461 final URI graph = entry2.getKey();
462 final boolean isEntries = graph.equals(PM.ENTRIES);
463 final boolean isExamples = isExampleGraph(graph);
464 final QuadModel filteredModel = QuadModel.create();
465 outer: for (final Statement stmt : entry2.getValue()) {
466 if (stmt.getPredicate().getNamespace().equals("sys:")) {
467 continue;
468 } else if (stmt.getPredicate().equals(RDF.TYPE)) {
469 if (stmt.getObject() instanceof BNode) {
470 continue;
471 } else if (stmt.getObject() instanceof URI
472 && ((URI) stmt.getObject()).getNamespace().equals("sys:")) {
473 continue;
474 } else if (isExamples) {
475 for (final QuadModel model : sourceModels.values()) {
476 if (model != entry2.getValue() && model.contains(stmt)) {
477 continue outer;
478 }
479 }
480 } else if (!isEntries) {
481 if (entriesModel != null && entriesModel.contains(stmt)) {
482 continue;
483 }
484 }
485 }
486 filteredModel.add(stmt);
487 }
488 final int sizeBefore = entry2.getValue().size();
489 entry2.setValue(filteredModel);
490 final int sizeAfter = entry2.getValue().size();
491 LOGGER.info("ABox filtered for {}, graph {}: from {} to {} quads", source,
492 entry2.getKey(), sizeBefore, sizeAfter);
493 }
494 }
495 }
496
497
498 for (final Statement stmt : ImmutableList.copyOf(tbox)) {
499 if (stmt.getPredicate().getNamespace().equals("sys:")
500 || stmt.getObject() instanceof URI
501 && ((URI) stmt.getObject()).getNamespace().equals("sys:")) {
502 tbox.remove(stmt);
503 }
504 }
505
506
507 final List<String> sourceKeys = ImmutableList.copyOf(Iterables.concat(models.keySet(),
508 ImmutableList.of("on5", "wn30", "wn31", "ili", "all")));
509 final List<QuadModel> quadModels = models.values().stream()
510 .flatMap(m -> m.values().stream()).collect(Collectors.toList());
511 Map<String, MappingStatistics> msBefore = null;
512 Map<String, MappingStatistics> msAfter = null;
513 if (statistics) {
514 msBefore = Maps.newHashMap();
515 for (final Map.Entry<String, Map<URI, QuadModel>> entry : models.entrySet()) {
516 msBefore.put(entry.getKey(), new MappingStatistics(entry.getValue().values(),
517 sourceKeys, entry.getKey()));
518 }
519 msBefore.put("all", new MappingStatistics(quadModels, ImmutableList.of(), "all"));
520 msAfter = msBefore;
521 }
522
523
524 if (filterMappings) {
525 filterMappings(models);
526 }
527
528
529 if (statistics) {
530 if (filterMappings) {
531 msAfter = Maps.newHashMap();
532 for (final Map.Entry<String, Map<URI, QuadModel>> entry : models.entrySet()) {
533 msAfter.put(entry.getKey(), new MappingStatistics(entry.getValue().values(),
534 sourceKeys, entry.getKey()));
535 }
536 msAfter.put("all", new MappingStatistics(quadModels, ImmutableList.of(), "all"));
537 }
538 LOGGER.info("Resource statistics");
539 LOGGER.info(String.format(" %-10s %-9s %-9s %-9s %-9s %-9s %-9s %-9s %-9s %-9s",
540 "source", "#classes", "#roles", "#conc", "#entries", "#examples", "#annsets",
541 "#classrel", "#rolerel", "#corestmt"));
542 for (final Map.Entry<String, Map<URI, QuadModel>> entry : models.entrySet()) {
543 final String source = entry.getKey();
544 final InstanceStatistics s = new InstanceStatistics(entry.getValue().values(),
545 tbox);
546 LOGGER.info(String.format(" %-10s %-9d %-9d %-9d %-9d %-9d %-9d %-9d %-9d %-9d",
547 source, s.numSemanticClasses, s.numSemanticRoles, s.numConceptualizations,
548 s.numLexicalEntries, s.numExamples, s.numAnnotationSets, s.numClassRels,
549 s.numRoleRels, s.numCoreTriples));
550 }
551 final InstanceStatistics s = new InstanceStatistics(quadModels, tbox);
552 LOGGER.info(String.format(" %-10s %-9d %-9d %-9d %-9d %-9d %-9d %-9d %-9d %-9d",
553 "all", s.numSemanticClasses, s.numSemanticRoles, s.numConceptualizations,
554 s.numLexicalEntries, s.numExamples, s.numAnnotationSets, s.numClassRels,
555 s.numRoleRels, s.numCoreTriples));
556 LOGGER.info("Mapping statistics");
557 LOGGER.info(String.format(" %-32s %-39s %-39s", "sources", "# good mappings",
558 "# invalid mappings"));
559 LOGGER.info(String.format(
560 " %-10s %-10s %-10s %-9s %-9s %-9s %-9s %-9s %-9s %-9s %-9s", "from", "to",
561 "resource", "con", "class", "role", "other", "con", "class", "role", "other"));
562 for (final String from : sourceKeys) {
563 final Integer z = new Integer(0);
564 for (final String to : sourceKeys) {
565 for (final String resource : Iterables.concat(models.keySet(),
566 ImmutableList.of("all"))) {
567 final MappingStatistics ms = msAfter.get(resource);
568 final MappingStatistics msb = msBefore.get(resource);
569 final int nx, nc, nr, no, nxb, ncb, nrb, nob;
570 nx = MoreObjects.firstNonNull(ms.conMappings.get(from, to), z);
571 nc = MoreObjects.firstNonNull(ms.classMappings.get(from, to), z);
572 nr = MoreObjects.firstNonNull(ms.roleMappings.get(from, to), z);
573 no = MoreObjects.firstNonNull(ms.otherMappings.get(from, to), z);
574 nxb = MoreObjects.firstNonNull(msb.conMappings.get(from, to), z);
575 ncb = MoreObjects.firstNonNull(msb.classMappings.get(from, to), z);
576 nrb = MoreObjects.firstNonNull(msb.roleMappings.get(from, to), z);
577 nob = MoreObjects.firstNonNull(msb.otherMappings.get(from, to), z);
578 if (nxb + ncb + nrb + nob > 0) {
579 LOGGER.info(String.format(
580 " %-10s %-10s %-10s %-9d %-9d %-9d %-9d %-9d %-9d %-9d %-9d",
581 from, to, resource, nx, nc, nr, no, nxb - nx, ncb - nc,
582 nrb - nr, nob - no));
583 }
584 }
585 }
586 }
587 }
588
589
590 LOGGER.info("Emitting datasets ...");
591
592
593 emit(base, "tbox", formats, ImmutableMap.of(PM.TBOX, tbox), null, owl2rl, false);
594
595
596 final Multimap<URI, QuadModel> modelsByURI = HashMultimap.create();
597 for (final Map.Entry<String, Map<URI, QuadModel>> entry : models.entrySet()) {
598 final String source = entry.getKey();
599 final Map<URI, QuadModel> graphModels = entry.getValue();
600 emit(base, source, formats, Maps.filterKeys(graphModels, g -> !isExampleGraph(g)),
601 tbox, owl2rl, statistics);
602 emit(base, source + "-examples", formats,
603 Maps.filterKeys(graphModels, g -> isExampleGraph(g)), tbox, owl2rl,
604 statistics);
605 modelsByURI.putAll(Multimaps.forMap(graphModels));
606 }
607
608
609 final Map<URI, QuadModel> mergedGraphModels = Maps.newHashMap();
610 mergedGraphModels.put(PM.TBOX, tbox);
611 for (final Map.Entry<URI, Collection<QuadModel>> entry : modelsByURI.asMap().entrySet()) {
612 if (entry.getValue().size() == 1) {
613 mergedGraphModels.put(entry.getKey(), entry.getValue().iterator().next());
614 } else if (entry.getValue().size() > 1) {
615 final QuadModel mergedModel = QuadModel.create();
616 for (final QuadModel model : entry.getValue()) {
617 for (final Namespace ns : model.getNamespaces()) {
618 mergedModel.setNamespace(ns);
619 }
620 mergedModel.addAll(model);
621 }
622 mergedGraphModels.put(entry.getKey(), mergedModel);
623 }
624 }
625 emit(base, "models", formats, Maps.filterKeys(mergedGraphModels, g -> !isExampleGraph(g)),
626 tbox, owl2rl, statistics);
627 emit(base, "all", formats, mergedGraphModels, tbox, owl2rl, statistics);
628 }
629
630 private static void emit(final String base, final String classifier, final String[] formats,
631 final Map<URI, QuadModel> models, @Nullable final QuadModel tbox, final boolean owl2rl,
632 final boolean statistics) throws RDFHandlerException {
633
634
635 final List<RDFProcessor> processors = Lists.newArrayList();
636 for (final String format : formats) {
637 final String location = base + "-" + classifier + (owl2rl ? "-inf." : ".") + format;
638 processors.add(RDFProcessors.write(null, 1000, location));
639 }
640 processors.add(RDFProcessors.track(new Tracker(LOGGER, null,
641 classifier + (owl2rl ? "-inf" : "") + " - %d quads", null)));
642
643
644 if (statistics) {
645 final List<RDFProcessor> statsProcessors = Lists.newArrayList();
646 statsProcessors.add(RDFProcessors.stats(null, null, null, null, false));
647 for (final String format : formats) {
648 final String location = base + "-" + classifier + "-stats." + format;
649 statsProcessors.add(RDFProcessors.write(null, 1000, location));
650 }
651 statsProcessors.add(RDFProcessors
652 .track(new Tracker(LOGGER, null, classifier + "-stats - %d quads", null)));
653 statsProcessors.add(RDFProcessors.NIL);
654 processors
655 .add(RDFProcessors.parallel(SetOperator.UNION_MULTISET, RDFProcessors.IDENTITY,
656 RDFProcessors.sequence(statsProcessors.toArray(new RDFProcessor[0]))));
657 }
658
659
660 if (owl2rl && tbox != null) {
661 processors.add(new ProcessorUndoRDFS(RDFSources.wrap(tbox)));
662 for (final String format : formats) {
663 final String location = base + "-" + classifier + "-noinf." + format;
664 processors.add(RDFProcessors.write(null, 1000, location));
665 }
666 processors.add(RDFProcessors
667 .track(new Tracker(LOGGER, null, classifier + "-noinf - %d quads", null)));
668 }
669
670
671 final RDFProcessor processor = RDFProcessors
672 .sequence(processors.toArray(new RDFProcessor[processors.size()]));
673
674
675 final RDFHandler handler = RDFHandlers.decouple(processor.wrap(RDFHandlers.NIL));
676 try {
677
678 handler.startRDF();
679
680
681 final Set<Namespace> namespaces = Sets.newHashSet();
682 for (final QuadModel model : models.values()) {
683 namespaces.addAll(model.getNamespaces());
684 }
685 for (final Namespace namespace : Ordering.natural().sortedCopy(namespaces)) {
686 handler.handleNamespace(namespace.getPrefix(), namespace.getName());
687 }
688
689
690 final List<URI> sortedGraphs = Lists.newArrayList();
691 if (models.containsKey(PM.META)) {
692 sortedGraphs.add(PM.META);
693 }
694 if (models.containsKey(PM.ENTRIES)) {
695 sortedGraphs.add(PM.ENTRIES);
696 }
697 for (final URI graph : Ordering.from(Statements.valueComparator())
698 .sortedCopy(models.keySet())) {
699 if (!graph.equals(PM.META) && !graph.equals(PM.ENTRIES)) {
700 sortedGraphs.add(graph);
701 }
702 }
703 for (final URI graph : sortedGraphs) {
704 for (final Statement stmt : models.get(graph)) {
705 handler.handleStatement(new ContextStatementImpl(stmt.getSubject(),
706 stmt.getPredicate(), stmt.getObject(), graph));
707 }
708 }
709 } catch (final Throwable ex) {
710 LOGGER.error("File generation failed", ex);
711
712 } finally {
713
714 handler.endRDF();
715 IO.closeQuietly(handler);
716 }
717 }
718
719 private static void filterMappings(final Map<String, Map<URI, QuadModel>> models) {
720
721 LOGGER.info("Removing illegal mappings...");
722
723 final Set<URI> validItems = Sets.newHashSet();
724 for (final Map<URI, QuadModel> map : models.values()) {
725 for (final QuadModel model : map.values()) {
726 for (final Statement stmt : model.filter(null, PMO.EVOKED_CONCEPT, null)) {
727 validItems.add((URI) stmt.getSubject());
728
729 }
730 for (final Statement stmt : model.filter(null, PMO.SEM_ROLE, null)) {
731 validItems.add((URI) stmt.getSubject());
732 validItems.add((URI) stmt.getObject());
733 }
734 }
735 }
736
737 for (final Map<URI, QuadModel> map : models.values()) {
738 for (final Map.Entry<URI, QuadModel> entry : map.entrySet()) {
739 final QuadModel model = entry.getValue();
740 for (final URI type : new URI[] { PMO.CONCEPTUALIZATION_MAPPING,
741 PMO.SEMANTIC_CLASS_MAPPING, PMO.SEMANTIC_ROLE_MAPPING }) {
742
743 int numMappingsToDelete = 0;
744 int numMappings = 0;
745 int mappingsDeletedCompletely = 0;
746 int referencesRemoved = 0;
747 final Map<String, Integer> numMappingsPerSource = Maps.newHashMap();
748 final List<Statement> stmtsToDelete = Lists.newArrayList();
749 for (final Resource m : model.filter(null, RDF.TYPE, type).subjects()) {
750 ++numMappings;
751 final List<Statement> stmts = ImmutableList
752 .copyOf(model.filter(m, null, null));
753 boolean valid = true;
754 final List<Statement> stmtsInvalid = Lists.newArrayList();
755 for (final Statement stmt : stmts) {
756 if (stmt.getPredicate().equals(PMO.ITEM)
757 && !validItems.contains(stmt.getObject())) {
758
759 ++numMappingsToDelete;
760 final String str = stmt.getObject().stringValue();
761 for (final String source : models.keySet()) {
762 if (str.contains("-" + source + "-")
763 || str.contains("/" + source + "-")) {
764 numMappingsPerSource.put(source,
765 1 + numMappingsPerSource.getOrDefault(source, 0));
766 }
767 }
768
769 if (numMappingsToDelete <= 10) {
770 LOGGER.warn("Removing illegal mapping {} - missing {}", m,
771 stmt.getObject());
772 } else if (LOGGER.isDebugEnabled()) {
773 LOGGER.debug("Removing illegal mapping {} - missing {}", m,
774 stmt.getObject());
775 } else if (numMappingsToDelete == 11) {
776 LOGGER.warn("Omitting further illegal mappings ....");
777 }
778 stmtsInvalid.add(stmt);
779 valid = false;
780 break;
781 }
782 }
783 if (!valid) {
784 int items = 0, itemsInv = 0;
785 for (final Statement stmt : stmts) {
786 if (stmt.getPredicate().equals(PMO.ITEM)) {
787 items++;
788 }
789 }
790 for (final Statement stmt : stmtsInvalid) {
791 if (stmt.getPredicate().equals(PMO.ITEM)) {
792 itemsInv++;
793
794 }
795 }
796 if (items - itemsInv < 2) {
797 stmtsToDelete.addAll(stmts);
798 mappingsDeletedCompletely++;
799 if (numMappingsToDelete <= 10 || LOGGER.isDebugEnabled()) {
800 LOGGER.info("Removing the complete mapping");
801 }
802 } else {
803 stmtsToDelete.addAll(stmtsInvalid);
804 referencesRemoved++;
805 if (numMappingsToDelete <= 10 || LOGGER.isDebugEnabled()) {
806 LOGGER.info("Removing only missing reference");
807 }
808 }
809 }
810 }
811 if (numMappingsToDelete > 0) {
812 for (final Statement stmt : stmtsToDelete) {
813 model.remove(stmt);
814 }
815 LOGGER.warn(
816 "{}/{} illegal {} mappings and {} references {} removed from {}\n############################################################################################################",
817 mappingsDeletedCompletely, numMappings,
818 type.equals(PMO.SEMANTIC_CLASS_MAPPING) ? "semantic class"
819 : type.equals(PMO.CONCEPTUALIZATION_MAPPING)
820 ? "conceptualization" : "semantic role",
821 referencesRemoved, numMappingsPerSource, entry.getKey());
822 }
823 }
824 }
825 }
826
827
828 for (final Map<URI, QuadModel> map : models.values()) {
829 for (final Map.Entry<URI, QuadModel> entry : map.entrySet()) {
830 final QuadModel model = entry.getValue();
831
832 final List<Statement> stmts = ImmutableList
833 .copyOf(model.filter(null, PMO.ONTO_MATCH, null));
834 int numMappingsToDelete = 0;
835 int numTriplesToDelete = 0;
836
837 for (Statement stmt : stmts) {
838 if (!validItems.contains(stmt.getSubject())) {
839 ++numMappingsToDelete;
840
841
842 ++numTriplesToDelete;
843 model.remove(stmt);
844 if (numMappingsToDelete <= 10) {
845 LOGGER.warn("Removing illegal ontoMatch {} - missing {}",
846 stmt.getSubject(), stmt.getObject());
847 } else if (LOGGER.isDebugEnabled()) {
848 LOGGER.debug("Removing illegal ontoMatch {} - missing {}",
849 stmt.getSubject(), stmt.getObject());
850 } else if (numMappingsToDelete == 11) {
851 LOGGER.warn("Omitting further illegal ontoMatch assertions ....");
852 }
853
854
855
856 if (ImmutableList
857 .copyOf(model.filter(null, PMO.ONTO_MATCH, stmt.getObject()))
858 .isEmpty()) {
859
860 final List<Statement> onto_stmts_all = ImmutableList
861 .copyOf(model.filter((URI) stmt.getObject(), null, null));
862 for (final Statement s : onto_stmts_all) {
863
864 ++numTriplesToDelete;
865 model.remove(s);
866 LOGGER.debug("Removing onto triple {} - {} - {}", s.getSubject(),
867 s.getPredicate(), s.getObject());
868 }
869 }
870
871
872
873
874
875 if (!model.contains(stmt.getSubject(), PMO.ONTO_MATCH, null)) {
876
877 for (Statement rel_stmt : ImmutableList
878 .copyOf(model.filter(stmt.getSubject(), null, null))) {
879
880 ++numTriplesToDelete;
881 LOGGER.debug("Removing type triple {} - {} - {}",
882 rel_stmt.getSubject(), rel_stmt.getPredicate(),
883 rel_stmt.getObject());
884 model.remove(stmt);
885
886 }
887 }
888
889 }
890
891 }
892
893 LOGGER.warn(
894 "{} illegal ontoMatch assertions and {} related triples removed from {}\n############################################################################################################",
895 numMappingsToDelete, numTriplesToDelete, entry.getKey());
896
897 }
898 }
899
900 }
901
902 private static boolean isExampleGraph(final URI uri) {
903 return uri.getLocalName().endsWith("-ex");
904 }
905
906 private static final class InstanceStatistics {
907
908 final int numSemanticClasses;
909
910 final int numSemanticRoles;
911
912 final int numConceptualizations;
913
914 final int numLexicalEntries;
915
916 final int numExamples;
917
918 final int numAnnotationSets;
919
920 final int numClassRels;
921
922 final int numRoleRels;
923
924 final int numCoreTriples;
925
926 public InstanceStatistics(final Iterable<? extends QuadModel> models,
927 final QuadModel tbox) {
928
929 final Set<URI> roleRelProperties = Sets.newHashSet();
930 for (final Resource rel : tbox.filter(null, RDFS.SUBPROPERTYOF, PMO.ROLE_REL)
931 .subjects()) {
932 if (rel instanceof URI && !rel.equals(PMO.ROLE_REL)) {
933 roleRelProperties.add((URI) rel);
934 }
935 }
936
937 final Set<Value> classes = Sets.newHashSet();
938 final Set<Value> roles = Sets.newHashSet();
939 final Set<Value> examples = Sets.newHashSet();
940 final Set<Value> annotationSets = Sets.newHashSet();
941 final Set<Statement> classRels = Sets.newHashSet();
942 final Set<Statement> roleRels = Sets.newHashSet();
943 for (final QuadModel model : models) {
944 for (final Resource c : model.filter(null, RDF.TYPE, PMO.SEMANTIC_CLASS)
945 .subjects()) {
946 if (model.contains(null, PMO.EVOKED_CONCEPT, c)
947 || model.contains(c, PMO.CLASS_REL, null)
948 || model.contains(null, PMO.CLASS_REL, c)) {
949 classes.add(c);
950 }
951 }
952 roles.addAll(model.filter(null, PMO.SEM_ROLE, null).objects());
953 examples.addAll(model.filter(null, RDF.TYPE, PMO.EXAMPLE).subjects());
954 annotationSets.addAll(model.filter(null, RDF.TYPE, PMO.ANNOTATION_SET).subjects());
955 classRels.addAll(model.filter(null, PMO.CLASS_REL, null));
956 for (final URI roleRelProperty : roleRelProperties) {
957 roleRels.addAll(model.filter(null, roleRelProperty, null));
958 }
959 }
960 this.numSemanticClasses = classes.size();
961 this.numSemanticRoles = roles.size();
962 this.numExamples = examples.size();
963 this.numAnnotationSets = annotationSets.size();
964 this.numClassRels = classRels.size();
965 this.numRoleRels = roleRels.size();
966
967 final Set<Statement> conceptualizations = Sets.newHashSet();
968 final Set<Value> lexicalEntries = Sets.newHashSet();
969 for (final QuadModel model : models) {
970 for (final Statement stmt : model.filter(null, ONTOLEX.EVOKES, null)) {
971 if (classes.contains(stmt.getObject()) || roles.contains(stmt.getObject())) {
972 conceptualizations.add(stmt);
973 lexicalEntries.add(stmt.getSubject());
974 }
975 }
976 }
977 this.numConceptualizations = conceptualizations.size();
978 this.numLexicalEntries = lexicalEntries.size();
979
980 final Set<Value> coreInstances = Sets.newHashSet();
981 for (final QuadModel model : models) {
982 for (final Statement stmt : model.filter(null, RDF.TYPE, null)) {
983 final Value type = stmt.getObject();
984 if (type.equals(PMO.SEMANTIC_CLASS) || type.equals(PMO.SEMANTIC_ROLE)
985 || type.equals(PMO.CONCEPTUALIZATION) || type.equals(PMO.MAPPING)
986 || type.equals(ONTOLEX.LEXICAL_ENTRY) || type.equals(ONTOLEX.FORM)) {
987 coreInstances.add(stmt.getSubject());
988 }
989 }
990 }
991
992 final Set<Statement> coreStmts = Sets.newHashSet();
993 for (final QuadModel model : models) {
994 for (final Statement stmt : model) {
995 if (coreInstances.contains(stmt.getSubject())
996 || coreInstances.contains(stmt.getObject())) {
997 if (stmt.getPredicate().equals(ONTOLEX.CANONICAL_FORM)
998 || stmt.getPredicate().equals(ONTOLEX.WRITTEN_REP)
999 || stmt.getPredicate().equals(PMO.FIRST)) {
1000 continue;
1001 }
1002 final String ns = stmt.getPredicate().getNamespace();
1003 if (ns.equals(PMO.NAMESPACE) || ns.equals(ONTOLEX.NAMESPACE)
1004 || ns.equals(DECOMP.NAMESPACE) || ns.equals(LEXINFO.NAMESPACE)
1005 || ns.equals(RDFS.NAMESPACE) || ns.equals(OWL.NAMESPACE)
1006 || ns.equals(DCTERMS.NAMESPACE)) {
1007 coreStmts.add(stmt);
1008 }
1009 }
1010 }
1011 }
1012 this.numCoreTriples = coreStmts.size();
1013 }
1014
1015 }
1016
1017 private static final class MappingStatistics {
1018
1019 final Table<String, String, Integer> conMappings;
1020
1021 final Table<String, String, Integer> classMappings;
1022
1023 final Table<String, String, Integer> roleMappings;
1024
1025 final Table<String, String, Integer> otherMappings;
1026
1027 public MappingStatistics(final Iterable<? extends QuadModel> models,
1028 final Iterable<String> sources, final String resource) {
1029
1030 final Table<String, String, Set<Hash>> conHashes = HashBasedTable.create();
1031 final Table<String, String, Set<Hash>> classHashes = HashBasedTable.create();
1032 final Table<String, String, Set<Hash>> roleHashes = HashBasedTable.create();
1033 final Table<String, String, Set<Hash>> otherHashes = HashBasedTable.create();
1034
1035 final List<String> sourceKeys = ImmutableList.copyOf(sources);
1036 final List<Pattern> sourcePatterns = ImmutableList.copyOf(sourceKeys.stream()
1037 .map(s -> Pattern.compile("[-/]" + Pattern.quote(s) + "-")).iterator());
1038
1039 for (final QuadModel model : models) {
1040 for (final Resource mapping : model.filter(null, RDF.TYPE, PMO.MAPPING)
1041 .subjects()) {
1042
1043 final Table<String, String, Set<Hash>> hashes;
1044 if (model.contains(mapping, RDF.TYPE, PMO.CONCEPTUALIZATION_MAPPING)) {
1045 hashes = conHashes;
1046 } else if (model.contains(mapping, RDF.TYPE, PMO.SEMANTIC_CLASS_MAPPING)) {
1047 hashes = classHashes;
1048 } else if (model.contains(mapping, RDF.TYPE, PMO.SEMANTIC_ROLE_MAPPING)) {
1049 hashes = roleHashes;
1050 } else {
1051 hashes = otherHashes;
1052 }
1053
1054 final Map<String, String> items = Maps.newHashMap();
1055 for (final Value item : model.filter(mapping, PMO.ITEM, null).objects()) {
1056 final String str = item.stringValue();
1057 for (int i = 0; i < sourceKeys.size(); ++i) {
1058 if (sourcePatterns.get(i).matcher(str).find()) {
1059 items.put(sourceKeys.get(i), item.stringValue());
1060 }
1061 }
1062 }
1063
1064 for (final String fromSource : items.keySet()) {
1065 for (final String toSource : items.keySet()) {
1066 if (fromSource.compareTo(toSource) < 0) {
1067 addHash(hashes, fromSource, toSource, items.get(fromSource), "|",
1068 items.get(toSource));
1069 }
1070 }
1071 }
1072
1073 addHash(hashes, "all", "all",
1074 Joiner.on('|').join(Ordering.natural().sortedCopy(items.values())));
1075 }
1076
1077 int mappingsCount = model.filter(null, PMO.ONTO_MATCH, null).size();
1078 if (mappingsCount != 0)
1079 LOGGER.debug(
1080 "Processing " + mappingsCount + " for mapping resource " + resource);
1081
1082 for (final Statement mapping : model.filter(null, PMO.ONTO_MATCH, null)) {
1083
1084 final Resource subject = mapping.getSubject();
1085 final Value object = mapping.getObject();
1086
1087 final Table<String, String, Set<Hash>> hashes;
1088
1089 if (model.contains(subject, RDF.TYPE, PMO.CONCEPTUALIZATION)) {
1090 hashes = conHashes;
1091 } else if (model.contains(subject, RDF.TYPE, PMO.SEMANTIC_CLASS)) {
1092 hashes = classHashes;
1093 } else if (model.contains(subject, RDF.TYPE, PMO.SEMANTIC_ROLE)) {
1094 hashes = roleHashes;
1095 } else
1096 hashes = otherHashes;
1097
1098 final String subjStr = subject.stringValue();
1099 String subjRes = "";
1100 for (int i = 0; i < sourceKeys.size(); ++i) {
1101 if (sourcePatterns.get(i).matcher(subjStr).find()) {
1102 subjRes = sourceKeys.get(i);
1103 break;
1104 }
1105 }
1106 final String objStr = object.stringValue();
1107
1108 addHash(hashes, subjRes, resource, subjStr, "|", objStr);
1109 addHash(hashes, "all", "all", subjStr, "|", objStr);
1110 }
1111
1112 }
1113
1114 this.conMappings = countHashes(conHashes);
1115 this.classMappings = countHashes(classHashes);
1116 this.roleMappings = countHashes(roleHashes);
1117 this.otherMappings = countHashes(otherHashes);
1118 }
1119
1120 private static void addHash(final Table<String, String, Set<Hash>> hashes,
1121 final String row, final String col, final String... hashedStrings) {
1122 Set<Hash> set = hashes.get(row, col);
1123 if (set == null) {
1124 set = Sets.newHashSet();
1125 hashes.put(row, col, set);
1126 }
1127 set.add(Hash.murmur3(hashedStrings));
1128 }
1129
1130 private static Table<String, String, Integer> countHashes(
1131 final Table<String, String, Set<Hash>> hashes) {
1132 final Table<String, String, Integer> counts = HashBasedTable.create();
1133 for (final Cell<String, String, Set<Hash>> cell : hashes.cellSet()) {
1134 counts.put(cell.getRowKey(), cell.getColumnKey(), cell.getValue().size());
1135 }
1136 return counts;
1137 }
1138
1139 }
1140
1141 }