1 package eu.fbk.dkm.premon.premonitor;
2
3 import com.google.common.collect.HashMultimap;
4 import com.google.common.collect.Multimap;
5 import com.google.common.io.Files;
6 import eu.fbk.dkm.premon.premonitor.propbank.*;
7 import eu.fbk.dkm.premon.util.NF;
8 import eu.fbk.dkm.utils.CommandLine;
9 import eu.fbk.rdfpro.*;
10 import eu.fbk.rdfpro.util.Algebra;
11 import eu.fbk.rdfpro.util.QuadModel;
12 import org.openrdf.model.Statement;
13 import org.openrdf.model.URI;
14 import org.openrdf.model.Value;
15 import org.openrdf.model.impl.ValueFactoryImpl;
16 import org.openrdf.model.vocabulary.DCTERMS;
17 import org.openrdf.model.vocabulary.OWL;
18 import org.openrdf.model.vocabulary.RDF;
19 import org.openrdf.query.BindingSet;
20 import org.openrdf.query.algebra.TupleExpr;
21 import org.openrdf.rio.RDFHandler;
22 import org.openrdf.rio.RDFHandlerException;
23 import org.slf4j.Logger;
24 import org.slf4j.LoggerFactory;
25
26 import javax.xml.bind.JAXBContext;
27 import javax.xml.bind.Unmarshaller;
28 import java.io.File;
29 import java.util.*;
30 import java.util.regex.Matcher;
31 import java.util.regex.Pattern;
32
33 public class Premonitor_old {
34
35 static class PBfinfo {
36 private String fileName;
37 private String type;
38 private String lemma;
39
40 public String getFileName() {
41 return fileName;
42 }
43
44 public String getType() {
45 return type;
46 }
47
48 public String getLemma() {
49 return lemma;
50 }
51
52 public PBfinfo(String fileName, boolean isOntoNotes) throws Exception {
53 this.fileName = fileName;
54 this.type = "v";
55 this.lemma = fileName.replaceAll("\\.xml", "");
56
57 if (isOntoNotes) {
58 Matcher matcher = ONTONOTES_FILENAME_PATTERN.matcher(fileName);
59 if (matcher.matches()) {
60 this.type = matcher.group(2);
61 this.lemma = matcher.group(1);
62 }
63 else {
64 throw new Exception("File " + fileName + " does not appear to be a good OntoNotes frame file");
65 }
66 }
67
68 }
69 }
70
71
72
73
74
75
76
77
78
79
80
private static final Logger LOGGER = LoggerFactory.getLogger(Premonitor_old.class);

/** Namespace used to build the WordNet URI of a lexical entry. */
static final String WN_NAMESPACE = "http://wordnet-rdf.princeton.edu/wn31/";
/** OntoNotes frame file name: group 1 is the lemma, group 2 the lowercase type suffix. */
static final Pattern ONTONOTES_FILENAME_PATTERN = Pattern.compile("(.*)-([a-z]+)\\.xml");
/** Theta role name made of a non-numeric part (group 1) followed by a number (group 2). */
static final Pattern THETA_NAME_PATTERN = Pattern.compile("^([^0-9]+)([0-9]+)$");
/** Leading non-numeric prefix ending with a dash, removed from VerbNet class names. */
static final String VN_NAME_REGEXP = "^[^0-9]+-";
/** Shape of an accepted VerbNet class code, e.g. {@code 29.5-1}. */
static final Pattern VN_CODE_PATTERN = Pattern.compile("^[0-9]+(\\.[0-9]+)*(-[0-9]+)*$");

static final ValueFactoryImpl factory = ValueFactoryImpl.getInstance();

static final String DEFAULT_LANGUAGE = "en";
static final String DEFAULT_NAMESPACE = "http://pb2rdf.org/";

/** Argument names that are replaced by the mapped value before a role is looked up. */
private static final Map<String, String> bugMap = new HashMap<String, String>();

static {
    bugMap.put("@", "2");
    bugMap.put("av", "adv");
    bugMap.put("ds", "dis");
    bugMap.put("a", "agent");
    bugMap.put("pred", "prd");
    bugMap.put("o", "0");
    bugMap.put("emitter of hoot", "0");
}

/** Predicate lemmas (after '+' normalisation) that are rewritten to the mapped lemma. */
private static final Map<String, String> lemmaToTransform = new HashMap<String, String>();

static {
    lemmaToTransform.put("cry+down(e)", "cry+down");
}

/** Names of frame files that are always skipped. */
private static final Set<String> fileToDiscard = new HashSet<String>();

static {
    fileToDiscard.add("except-v.xml");
}

/** Function tags for which a role is always declared, whether or not a frame file uses them. */
private static final Set<String> functionTags = new HashSet<String>(Arrays.asList(
        "ext", "loc", "dir", "neg", "mod", "adv", "mnr", "prd",
        "rec", "tmp", "prp", "pnc", "cau", "adj", "com", "dis",
        "dsp", "gol", "pag", "ppt", "rcl", "slc", "vsp", "lvb"));

/**
 * Word to tag table.
 * NOTE(review): not referenced by any code visible in this file; the tags look like
 * part-of-speech codes, but that is an assumption to confirm.
 */
private static final Map<String, String> additionalWords = new HashMap<String, String>();

static {
    additionalWords.put("through", "prep");
    additionalWords.put("vent", "n");
    additionalWords.put("away", "r");
    additionalWords.put("about", "r");
    additionalWords.put("back", "r");
    additionalWords.put("upon", "prep");
    additionalWords.put("aback", "r");
    additionalWords.put("down", "r");
    additionalWords.put("around", "r");
    additionalWords.put("out", "r");
    additionalWords.put("hold", "n");
    additionalWords.put("across", "r");
    additionalWords.put("along", "r");
    additionalWords.put("by", "prep");
    additionalWords.put("rubber", "n");
    additionalWords.put("up", "prep");
    additionalWords.put("after", "r");
    additionalWords.put("hard", "r");
    additionalWords.put("together", "r");
    additionalWords.put("on", "r");
    additionalWords.put("apart", "r");
    additionalWords.put("over", "r");
    additionalWords.put("in", "r");
    additionalWords.put("like", "prep");
    additionalWords.put("forward", "r");
    additionalWords.put("tree", "n");
    additionalWords.put("clear", "s");
    additionalWords.put("birth", "n");
    additionalWords.put("it", "pron");
    additionalWords.put("forth", "r");
    additionalWords.put("off", "r");
    additionalWords.put("wrong", "s");
    additionalWords.put("the", "art");
    additionalWords.put("aside", "r");
    additionalWords.put("even", "r");
    additionalWords.put("loose", "r");
    additionalWords.put("suit", "n");
    additionalWords.put("to", "prep");
    additionalWords.put("rise", "n");
}
191
192 private static void addDefinition(Collection<Statement> statements, URI uri, URI definitionURI, String value, String language) {
193 Statement statement;
194 statement = factory.createStatement(definitionURI, RDF.TYPE, LEMON.SENSE_DEFINITION);
195 statements.add(statement);
196 statement = factory.createStatement(uri, LEMON.DEFINITION, definitionURI);
197 statements.add(statement);
198 statement = factory.createStatement(definitionURI, LEMON.VALUE, factory.createLiteral(value, language));
199 statements.add(statement);
200
201 }
202
203 public static void main(String[] args) {
204
205 try {
206 final CommandLine cmd = CommandLine
207 .parser()
208 .withName("./premonitor")
209 .withHeader("Transform a ProbBank instance into RDF")
210 .withOption("i", "input", "input folder", "FOLDER", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
211 .withOption("w", "output", "Output file", "FILE", CommandLine.Type.FILE, true, false, true)
212 .withOption("l", "lang", String.format("Language for literals, default %s", DEFAULT_LANGUAGE), "ISO-CODE", CommandLine.Type.STRING, true, false, false)
213 .withOption("v", "non-verbs", "Extract also non-verbs (only for OntoNotes)")
214 .withOption("o", "ontonotes", "Specify that this is an OntoNotes version of ProbBank")
215 .withOption("e", "examples", "Extract examples")
216 .withOption(null, "use-wn-lex", "Use WordNet LexicalEntries when available")
217 .withOption("s", "single", "Extract single lemma", "LEMMA", CommandLine.Type.STRING, true, false, false)
218 .withOption(null, "namespace", String.format("Namespace, default %s", DEFAULT_NAMESPACE), "URI", CommandLine.Type.STRING, true, false, false)
219 .withOption(null, "wordnet", "WordNet RDF triple file", "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
220 .withOption(null, "framenet", "FrameNet RDF triple file", "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
221 .withOption(null, "verbnet", "VerbNet RDF triple file", "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
222 .withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
223
224 File folder = cmd.getOptionValue("input", File.class);
225 File outputFile = cmd.getOptionValue("output", File.class);
226
227 File wnRDF = null;
228 if (cmd.hasOption("wordnet")) {
229 wnRDF = cmd.getOptionValue("wordnet", File.class);
230 }
231 File fnRDF = null;
232 if (cmd.hasOption("framenet")) {
233 fnRDF = cmd.getOptionValue("framenet", File.class);
234 }
235 File vnRDF = null;
236 if (cmd.hasOption("verbnet")) {
237 vnRDF = cmd.getOptionValue("verbnet", File.class);
238 }
239
240 String language = DEFAULT_LANGUAGE;
241 if (cmd.hasOption("lang")) {
242 language = cmd.getOptionValue("lang", String.class);
243 }
244
245 boolean onlyVerbs = !cmd.hasOption("non-verbs");
246 boolean isOntoNotes = cmd.hasOption("ontonotes");
247 boolean extractExamples = cmd.hasOption("examples");
248 boolean useWordNetLEs = cmd.hasOption("use-wn-lex");
249
250 String onlyOne = null;
251 if (cmd.hasOption("single")) {
252 onlyOne = cmd.getOptionValue("single", String.class);
253 }
254
255 String namespace = DEFAULT_NAMESPACE;
256 if (cmd.hasOption("namespace")) {
257 namespace = cmd.getOptionValue("namespace", String.class);
258 }
259
260
261 System.setProperty("javax.xml.accessExternalDTD", "file");
262
263 JAXBContext jaxbContext = JAXBContext.newInstance(Frameset.class);
264 Unmarshaller jaxbUnmarshaller = jaxbContext.createUnmarshaller();
265
266 HashSet<Statement> statements = new HashSet<Statement>();
267 Statement statement;
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310 URI lexiconURI = factory.createURI(namespace, "lexicon");
311 statement = factory.createStatement(lexiconURI, RDF.TYPE, LEMON.LEXICON);
312 statements.add(statement);
313 statement = factory.createStatement(lexiconURI, LEMON.LANGUAGE, factory.createLiteral("en"));
314 statements.add(statement);
315
316 HashSet<String> roleNs = new HashSet<String>();
317 HashSet<String> roleFs = new HashSet<String>();
318
319 HashSet<String> roleSetsToIgnore = new HashSet<String>();
320
321 final HashSet<URI> wnURIs = new HashSet<URI>();
322 if (wnRDF != null) {
323 LOGGER.info("Loading WordNet");
324 RDFSource source = RDFSources.read(true, true, null, null, wnRDF.getAbsolutePath());
325 source.emit(new AbstractRDFHandler() {
326 @Override
327 public void handleStatement(Statement statement) throws RDFHandlerException {
328 if (statement.getPredicate().equals(RDF.TYPE) && statement.getObject().equals(LEMON.LEXICAL_ENTRY)) {
329 if (statement.getSubject() instanceof URI) {
330 synchronized (wnURIs) {
331 wnURIs.add((URI) statement.getSubject());
332 }
333 }
334 }
335 }
336 }, 1);
337 LOGGER.info("Loaded {} URIs", wnURIs.size());
338 }
339
340 Multimap<String, URI> fnFrames = HashMultimap.create();
341 if (fnRDF != null) {
342 LOGGER.info("Loading FrameNet");
343 final QuadModel model = QuadModel.create();
344 RDFSource source = RDFSources.read(true, true, null, null, fnRDF.getAbsolutePath());
345 source.emit(new AbstractRDFHandler() {
346 @Override
347 public void handleStatement(Statement statement) throws RDFHandlerException {
348 if (statement.getObject().equals(LEMON.LEXICAL_SENSE) && statement.getPredicate().equals(RDF.TYPE)) {
349 synchronized (model) {
350 model.add(statement);
351 }
352 }
353 if (statement.getPredicate().equals(PURL.LABEL)) {
354 synchronized (model) {
355 model.add(statement);
356 }
357 }
358 }
359 }, 1);
360 TupleExpr query = Algebra.parseTupleExpr(
361 "SELECT ?s ?l\n" +
362 "WHERE {\n" +
363 "\t?s a <http://lemon-model.net/lemon#LexicalSense> .\n" +
364 "\t?s <http://purl.org/olia/ubyCat.owl#label> ?l\n" +
365 "}",
366 null, null);
367 Iterator<BindingSet> bindingSetIterator = model.evaluate(query, null, null);
368 while (bindingSetIterator.hasNext()) {
369 BindingSet bindings = bindingSetIterator.next();
370 Value fnFrame = bindings.getValue("l");
371 Value fnSense = bindings.getValue("s");
372 if (fnSense instanceof URI) {
373 String stringValue = fnFrame.stringValue().toLowerCase();
374 fnFrames.put(stringValue, (URI) fnSense);
375 }
376 }
377 }
378
379 Multimap<String, URI> vnFrames = HashMultimap.create();
380
381 if (vnRDF != null) {
382 LOGGER.info("Loading VerbNet");
383 final QuadModel model = QuadModel.create();
384 RDFSource source = RDFSources.read(true, true, null, null, vnRDF.getAbsolutePath());
385 source.emit(new AbstractRDFHandler() {
386 @Override
387 public void handleStatement(Statement statement) throws RDFHandlerException {
388 if (statement.getObject().equals(LEMON.LEXICAL_SENSE) && statement.getPredicate().equals(RDF.TYPE)) {
389 synchronized (model) {
390 model.add(statement);
391 }
392 }
393 if (statement.getPredicate().equals(PURL.LABEL)) {
394 synchronized (model) {
395 model.add(statement);
396 }
397 }
398 if (statement.getPredicate().equals(PURL.SEMANTIC_LABEL)) {
399 synchronized (model) {
400 model.add(statement);
401 }
402 }
403 }
404 }, 1);
405
406 TupleExpr query;
407 Iterator<BindingSet> bindingSetIterator;
408
409
410 query = Algebra.parseTupleExpr(
411 "SELECT ?l ?s WHERE {\n" +
412 "\t?s a <http://lemon-model.net/lemon#LexicalSense> .\n" +
413 "\t?s <http://purl.org/olia/ubyCat.owl#semanticLabel> ?b .\n" +
414 "\t?b <http://purl.org/olia/ubyCat.owl#label> ?l\n" +
415 "}",
416 null, null);
417 bindingSetIterator = model.evaluate(query, null, null);
418 while (bindingSetIterator.hasNext()) {
419 BindingSet bindings = bindingSetIterator.next();
420 Value vnFrame = bindings.getValue("l");
421 Value vnSense = bindings.getValue("s");
422 if (vnSense instanceof URI) {
423 String stringValue = vnFrame.stringValue();
424 stringValue = getSenseNumberOnly(stringValue);
425 vnFrames.put(stringValue, (URI) vnSense);
426 }
427 }
428 }
429 for (String vnSense : vnFrames.keySet()) {
430
431 URI vnSenseURI = createVerbNetURIForSense(vnSense, namespace);
432 statement = factory.createStatement(vnSenseURI, RDF.TYPE, LEMON.LEXICAL_SENSE);
433 statements.add(statement);
434
435 for (URI sense : vnFrames.get(vnSense)) {
436 statement = factory.createStatement(sense, LEMON.BROADER, vnSenseURI);
437 statements.add(statement);
438 }
439 }
440
441
442
443 LOGGER.info("Getting list of roles");
444 HashSet<String> thetaRoles = new HashSet<String>();
445 Multimap<String, String> rolesForSense = HashMultimap.create();
446
447 HashSet<String> allExternalTokens = new HashSet<>();
448
449 for (File file : Files.fileTreeTraverser().preOrderTraversal(folder)) {
450
451 if (discardFile(file, onlyVerbs, isOntoNotes)) {
452 continue;
453 }
454
455 PBfinfo fileInfo;
456 try {
457 fileInfo = new PBfinfo(file.getName(), isOntoNotes);
458 } catch (Exception e) {
459 throw e;
460 }
461
462 String fileName = fileInfo.getFileName();
463 String type = fileInfo.getType();
464 String lemmaFromName = fileInfo.getLemma();
465
466 if (fileToDiscard.contains(fileName)) {
467 continue;
468 }
469
470 if (onlyOne != null && !onlyOne.equals(fileInfo.getLemma())) {
471 continue;
472 }
473
474 Frameset frameset = (Frameset) jaxbUnmarshaller.unmarshal(file);
475 List<Object> noteOrPredicate = frameset.getNoteOrPredicate();
476
477 for (Object predicate : noteOrPredicate) {
478 if (predicate instanceof Predicate) {
479
480 String lemma = ((Predicate) predicate).getLemma().replace('_', '+').replace(' ', '+');
481 if (lemmaToTransform.keySet().contains(lemma)) {
482 lemma = lemmaToTransform.get(lemma);
483 }
484 String[] tokens = lemma.split("\\+");
485
486 for (String token : tokens) {
487 if (token.equals(lemmaFromName)) {
488 continue;
489 }
490 allExternalTokens.add(token);
491 }
492
493
494 List<Object> noteOrRoleset = ((Predicate) predicate).getNoteOrRoleset();
495 for (Object roleset : noteOrRoleset) {
496 if (roleset instanceof Roleset) {
497
498 List<Object> rolesOrExample = ((Roleset) roleset).getNoteOrRolesOrExampleOrAliases();
499 for (Object roles : rolesOrExample) {
500 if (roles instanceof Roles) {
501
502 int okRoles = 0;
503
504 List<Object> noteOrRole = ((Roles) roles).getNoteOrRole();
505 for (Object role : noteOrRole) {
506 if (role instanceof Role) {
507 String n = ((Role) role).getN();
508 String f = ((Role) role).getF();
509
510 NF nf = new NF(n, f);
511
512
513 String argName = nf.getArgName();
514 if (argName == null) {
515 continue;
516 }
517 if (bugMap.containsKey(argName)) {
518 continue;
519 }
520
521 if (nf.getN() != null) {
522 roleNs.add(nf.getN());
523 okRoles++;
524 }
525 if (nf.getF() != null) {
526 roleFs.add(nf.getF());
527 }
528
529 List<Vnrole> vnroleList = ((Role) role).getVnrole();
530 for (Vnrole vnrole : vnroleList) {
531 if (vnrole.getVntheta() != null && vnrole.getVntheta().trim().length() > 0) {
532 String okRole = getThetaName(vnrole.getVntheta().toLowerCase());
533 thetaRoles.add(okRole);
534
535 String vnSenseString = vnrole.getVncls();
536 HashSet<String> senses = getGoodSensesOnly(vnSenseString);
537 for (String sense : senses) {
538 rolesForSense.put(sense, okRole);
539 }
540 }
541 }
542
543 }
544 }
545
546 if (okRoles == 0) {
547 roleSetsToIgnore.add(((Roleset) roleset).getId());
548 }
549 }
550 }
551 }
552 }
553 }
554 }
555 }
556
557
558
559
560
561 for (String thetaRole : thetaRoles) {
562 URI vnRoleURI = createVerbNetURIForRole(thetaRole, namespace);
563 statement = factory.createStatement(vnRoleURI, RDF.TYPE, PB2RDF.VN_THETA_ROLE_C);
564 statements.add(statement);
565 }
566
567 for (String vnSense : rolesForSense.keySet()) {
568
569 URI vnSenseURI = createVerbNetURIForSense(vnSense, namespace);
570
571
572 statement = factory.createStatement(vnSenseURI, RDF.TYPE, LEMON.LEXICAL_SENSE);
573 statements.add(statement);
574
575 for (String role : rolesForSense.get(vnSense)) {
576 URI vnSenseRoleURI = createVerbNetURIForSenseRole(vnSense, role, namespace);
577 URI vnRoleURI = createVerbNetURIForRole(role, namespace);
578
579 statement = factory.createStatement(vnSenseRoleURI, RDF.TYPE, LEMON.ARGUMENT);
580 statements.add(statement);
581 statement = factory.createStatement(vnSenseRoleURI, LEMON.SEM_ARG, vnSenseURI);
582 statements.add(statement);
583 statement = factory.createStatement(vnSenseRoleURI, PB2RDF.VN_THETA_ROLE, vnRoleURI);
584 statements.add(statement);
585 }
586 }
587
588
589
590
591 HashMap<String, Statement> roleStatements = new HashMap<String, Statement>();
592
593
594 for (String n : roleNs) {
595 try {
596 Integer number = Integer.parseInt(n);
597 roleStatements.put(number.toString(), factory.createStatement(PB2RDF.createRole(number), RDF.TYPE, PB2RDF.PB_THETA_ROLE_C));
598 } catch (Exception ignored) {
599
600 }
601 }
602
603
604 roleStatements.put(NF.AGENT, factory.createStatement(PB2RDF.createRole(NF.AGENT), RDF.TYPE, PB2RDF.PB_THETA_ROLE_C));
605 roleStatements.put(NF.MOD, factory.createStatement(PB2RDF.createRole(NF.MOD), RDF.TYPE, PB2RDF.PB_THETA_ROLE_C));
606
607 for (String functionTag : functionTags) {
608 roleStatements.put(functionTag, factory.createStatement(PB2RDF.createRole(functionTag), RDF.TYPE, PB2RDF.PB_THETA_ROLE_C));
609 }
610
611
612 for (String f : roleFs) {
613 roleStatements.put(f, factory.createStatement(PB2RDF.createRole(f), RDF.TYPE, PB2RDF.PB_THETA_ROLE_C));
614 }
615
616 for (String key : roleStatements.keySet()) {
617 statements.add(roleStatements.get(key));
618 }
619
620
621
622 LOGGER.info("Parsing PropBank files");
623 for (File file : Files.fileTreeTraverser().preOrderTraversal(folder)) {
624
625 if (discardFile(file, onlyVerbs, isOntoNotes)) {
626 continue;
627 }
628
629 PBfinfo fileInfo;
630 try {
631 fileInfo = new PBfinfo(file.getName(), isOntoNotes);
632 } catch (Exception e) {
633 throw e;
634 }
635
636 String fileName = fileInfo.getFileName();
637 String type = fileInfo.getType();
638 String lemmaFromName = fileInfo.getLemma();
639
640 if (fileToDiscard.contains(fileName)) {
641 continue;
642 }
643
644 if (onlyOne != null && !onlyOne.equals(lemmaFromName)) {
645 continue;
646 }
647
648 LOGGER.debug("{} ({})", fileName, type);
649
650 Frameset frameset = (Frameset) jaxbUnmarshaller.unmarshal(file);
651 List<Object> noteOrPredicate = frameset.getNoteOrPredicate();
652
653 for (Object predicate : noteOrPredicate) {
654 if (predicate instanceof Predicate) {
655
656 String lemma = ((Predicate) predicate).getLemma().replace('_', '+').replace(' ', '+');
657 if (lemmaToTransform.keySet().contains(lemma)) {
658 lemma = lemmaToTransform.get(lemma);
659 }
660
661 String wnLemma = lemma + "-" + type;
662
663 URI predicateURI = addLexicalEntry(useWordNetLEs, namespace, wnLemma, statements, lexiconURI, language, wnURIs);
664
665 List<Object> noteOrRoleset = ((Predicate) predicate).getNoteOrRoleset();
666 for (Object roleset : noteOrRoleset) {
667 if (roleset instanceof Roleset) {
668 String rolesetID = ((Roleset) roleset).getId();
669
670 String[] vnClasses = new String[0];
671 if (((Roleset) roleset).getVncls() != null) {
672 vnClasses = ((Roleset) roleset).getVncls().trim().split("\\s+");
673 }
674
675 String[] fnPredicates = new String[0];
676 if (((Roleset) roleset).getFramnet() != null) {
677 fnPredicates = ((Roleset) roleset).getFramnet().trim().toLowerCase().split("\\s+");
678 }
679
680 if (roleSetsToIgnore.contains(rolesetID)) {
681 continue;
682 }
683
684 String name = ((Roleset) roleset).getName();
685
686 URI senseURI = factory.createURI(namespace, rolesetID);
687
688
689 statement = factory.createStatement(senseURI, RDF.TYPE, LEMON.LEXICAL_SENSE);
690 statements.add(statement);
691 statement = factory.createStatement(senseURI, DCTERMS.SOURCE, factory.createLiteral(fileName));
692 statements.add(statement);
693 statement = factory.createStatement(predicateURI, LEMON.SENSE, senseURI);
694 statements.add(statement);
695
696 for (String vnSense : vnClasses) {
697
698 if (!vnFrames.containsKey(vnSense)) {
699 continue;
700 }
701
702 URI vnSenseURI = createVerbNetURIForSense(vnSense, namespace);
703
704 statement = factory.createStatement(senseURI, LEMON.BROADER, vnSenseURI);
705 statements.add(statement);
706 }
707
708 for (String fnPredicate : fnPredicates) {
709 if (!fnFrames.containsKey(fnPredicate)) {
710 continue;
711 }
712
713 for (URI fnSenseURI : fnFrames.get(fnPredicate)) {
714 statement = factory.createStatement(senseURI, PB2RDF.SIMILAR, fnSenseURI);
715 statements.add(statement);
716 }
717 }
718
719
720 if (name != null && name.length() > 0) {
721 URI definitionURI = factory.createURI(namespace, rolesetID + "_def");
722 addDefinition(statements, senseURI, definitionURI, name, language);
723 }
724
725 List<Object> rolesOrExample = ((Roleset) roleset).getNoteOrRolesOrExampleOrAliases();
726
727 List<Example> examples = new ArrayList<Example>();
728
729 for (Object rOrE : rolesOrExample) {
730 if (rOrE instanceof Roles) {
731 List<Object> noteOrRole = ((Roles) rOrE).getNoteOrRole();
732 for (Object role : noteOrRole) {
733 if (role instanceof Role) {
734 String n = ((Role) role).getN();
735 String f = ((Role) role).getF();
736 String descr = ((Role) role).getDescr();
737 List<Vnrole> vnroleList = ((Role) role).getVnrole();
738
739 NF nf = new NF(n, f);
740 String argName = nf.getArgName();
741 if (argName == null) {
742
743 continue;
744 }
745
746
747 if (bugMap.containsKey(argName)) {
748 argName = bugMap.get(argName);
749 }
750
751 String roleText = rolesetID + "_role-" + nf.getArgName();
752 URI roleURI = factory.createURI(namespace, roleText);
753
754 statement = factory.createStatement(roleURI, RDF.TYPE, LEMON.ARGUMENT);
755 statements.add(statement);
756 statement = factory.createStatement(senseURI, LEMON.SEM_ARG, roleURI);
757 statements.add(statement);
758 try {
759 statement = factory.createStatement(roleURI, PB2RDF.PB_THETA_ROLE, roleStatements.get(argName).getSubject());
760 statements.add(statement);
761 } catch (Exception e) {
762 LOGGER.error(argName + " " + roleText + " " + fileName);
763 }
764
765 if (descr != null && descr.length() > 0) {
766 URI definitionURI = factory.createURI(namespace, roleText + "_def");
767 addDefinition(statements, roleURI, definitionURI, descr, language);
768 }
769
770 for (Vnrole vnrole : vnroleList) {
771 String vnSenseString = vnrole.getVncls();
772 String vnTheta = vnrole.getVntheta();
773
774 HashSet<String> senses = getGoodSensesOnly(vnSenseString);
775
776 if (vnTheta != null && vnTheta.trim().length() > 0) {
777 for (String sense : senses) {
778 URI uri = createVerbNetURIForSenseRole(sense, vnTheta, namespace);
779 statement = factory.createStatement(roleURI, PB2RDF.ARG_SIMILAR, uri);
780 statements.add(statement);
781 }
782 }
783 }
784 }
785 }
786 }
787
788 if (extractExamples) {
789 if (rOrE instanceof Example) {
790 examples.add((Example) rOrE);
791 }
792 }
793 }
794
795
796 int example = 1;
797
798 for (Example rOrE : examples) {
799 String text = null;
800 Inflection inflection = null;
801
802 String exType = rOrE.getType();
803 String exName = rOrE.getName();
804 String exSrc = rOrE.getSrc();
805
806 List<Rel> myRels = new ArrayList<Rel>();
807 List<Arg> myArgs = new ArrayList<Arg>();
808
809 List<Object> exThings = rOrE.getInflectionOrNoteOrTextOrArgOrRel();
810 for (Object thing : exThings) {
811 if (thing instanceof Text) {
812 text = ((Text) thing).getvalue().replaceAll("\\s+", " ").trim();
813 }
814 if (thing instanceof Inflection) {
815 inflection = (Inflection) thing;
816 }
817
818 if (thing instanceof Arg) {
819 myArgs.add((Arg) thing);
820 }
821
822
823 if (thing instanceof Rel) {
824 myRels.add((Rel) thing);
825 }
826 }
827
828 if (text != null && text.length() > 0) {
829
830 String exampleStr = rolesetID + "_ex" + (examples.size() > 1 ? example++ : "");
831 URI exampleURI = factory.createURI(namespace, exampleStr);
832
833 statement = factory.createStatement(exampleURI, RDF.TYPE, LEMON.USAGE_EXAMPLE);
834 statements.add(statement);
835 statement = factory.createStatement(senseURI, LEMON.EXAMPLE, exampleURI);
836 statements.add(statement);
837
838
839 addProperty(statements, exampleURI, PB2RDF.PB_EX_NAME, exName, language);
840 addProperty(statements, exampleURI, PB2RDF.PB_EX_SRC, exSrc, language);
841 addProperty(statements, exampleURI, PB2RDF.PB_EX_TYPE, exType, language);
842 addProperty(statements, exampleURI, LEMON.VALUE, text, language);
843
844 Map<String, List<Arg>> exampleArgs = new HashMap<String, List<Arg>>();
845 for (Arg myArg : myArgs) {
846
847 NF nf = new NF(myArg.getN(), myArg.getF());
848 String argName = nf.getArgName();
849
850 if (argName == null) {
851
852 continue;
853 }
854
855
856 if (bugMap.containsKey(argName)) {
857 argName = bugMap.get(argName);
858 }
859
860 if (!exampleArgs.containsKey(argName)) {
861 exampleArgs.put(argName, new ArrayList<Arg>());
862 }
863 exampleArgs.get(argName).add(myArg);
864 }
865
866 for (Map.Entry<String, List<Arg>> entry : exampleArgs.entrySet()) {
867 String argName = entry.getKey();
868 List<Arg> value = entry.getValue();
869 for (int i = 0; i < value.size(); i++) {
870 Arg myArg = value.get(i);
871 String argValue = myArg.getvalue();
872 if (argValue == null) {
873 throw new Exception("argValue is null");
874 }
875
876 String addendum = "";
877 if (value.size() > 1) {
878 addendum = "_" + (i + 1);
879 }
880
881 URI argURI = factory.createURI(namespace, exampleStr + "_arg-" + argName + addendum);
882
883 statement = factory.createStatement(argURI, RDF.TYPE, PB2RDF.EX_ARG_C);
884 statements.add(statement);
885 statement = factory.createStatement(exampleURI, PB2RDF.PB_EX_ARG, argURI);
886 statements.add(statement);
887 statement = factory.createStatement(argURI, LEMON.VALUE, factory.createLiteral(argValue, language));
888 statements.add(statement);
889 statement = factory.createStatement(argURI, PB2RDF.PB_THETA_ROLE, roleStatements.get(argName).getSubject());
890 statements.add(statement);
891 }
892 }
893
894 for (int i = 0; i < myRels.size(); i++) {
895 Rel myRel = myRels.get(i);
896
897 String addendum = "";
898 if (myRels.size() > 1) {
899 addendum += "_" + (i + 1);
900 }
901
902 NF nf = new NF(null, myRel.getF());
903 String relName = nf.getArgName();
904 String relValue = myRel.getvalue();
905
906 if (relValue == null) {
907 throw new Exception("argValue is null");
908 }
909
910 URI relURI = factory.createURI(namespace, exampleStr + "_rel" + addendum);
911
912 statement = factory.createStatement(relURI, RDF.TYPE, PB2RDF.EX_REL_C);
913 statements.add(statement);
914 statement = factory.createStatement(exampleURI, PB2RDF.PB_EX_REL, relURI);
915 statements.add(statement);
916 statement = factory.createStatement(relURI, LEMON.VALUE, factory.createLiteral(relValue, language));
917 statements.add(statement);
918 if (relName != null) {
919 statement = factory.createStatement(relURI, PB2RDF.PB_THETA_ROLE, roleStatements.get(relName).getSubject());
920 statements.add(statement);
921 }
922 }
923
924 if (inflection != null) {
925 URI inflectionURI = factory.createURI(namespace, exampleStr + "_inflection");
926
927 statement = factory.createStatement(inflectionURI, RDF.TYPE, PB2RDF.INFLECTION_C);
928 statements.add(statement);
929 statement = factory.createStatement(exampleURI, PB2RDF.PB_EX_INFLECTION, inflectionURI);
930 statements.add(statement);
931
932
933 addProperty(statements, inflectionURI, PB2RDF.PB_INF_ASPECT, inflection.getAspect(), language);
934 addProperty(statements, inflectionURI, PB2RDF.PB_INF_FORM, inflection.getForm(), language);
935 addProperty(statements, inflectionURI, PB2RDF.PB_INF_PERSON, inflection.getPerson(), language);
936 addProperty(statements, inflectionURI, PB2RDF.PB_INF_TENSE, inflection.getTense(), language);
937 addProperty(statements, inflectionURI, PB2RDF.PB_INF_VOICE, inflection.getVoice(), language);
938 }
939 }
940
941 }
942
943 }
944 }
945 }
946 }
947 }
948
949 RDFSource source = RDFSources.wrap(statements);
950 try {
951 RDFHandler rdfHandler = RDFHandlers.write(null, 1000, outputFile.getAbsolutePath());
952 RDFProcessors
953 .sequence(RDFProcessors.prefix(null), RDFProcessors.unique(false))
954 .apply(source, rdfHandler, 1);
955 } catch (Exception e) {
956 LOGGER.error("Input/output error, the file {} has not been saved ({})", outputFile.getAbsolutePath(), e.getMessage());
957 throw new RDFHandlerException(e);
958 }
959
960 LOGGER.info("File {} saved", outputFile.getAbsolutePath());
961
962 } catch (Throwable ex) {
963 CommandLine.fail(ex);
964 }
965
966
967 }
968
969 private static URI addLexicalEntry(boolean useWordNetLEs, String namespace, String lemma, Collection<Statement> statements, URI lexiconURI, String language, HashSet<URI> wnURIs) {
970 Statement statement;
971 URI wnURI = factory.createURI(WN_NAMESPACE, lemma);
972
973 URI predicateURI;
974 if (wnURIs.contains(wnURI) && useWordNetLEs) {
975 predicateURI = wnURI;
976 }
977 else {
978 LOGGER.info("Word {} is not in WordNet", lemma);
979 LOGGER.info(wnURI.toString());
980
981
982 String[] tokens = lemma.replaceAll("-[a-z]+$", "").split("\\+");
983 if (tokens.length > 1) {
984
985 }
986
987 predicateURI = factory.createURI(namespace, lemma);
988 statement = factory.createStatement(predicateURI, RDF.TYPE, LEMON.LEXICAL_ENTRY);
989 statements.add(statement);
990 statement = factory.createStatement(lexiconURI, LEMON.ENTRY, predicateURI);
991 statements.add(statement);
992
993
994
995 if (wnURIs.contains(wnURI)) {
996 statement = factory.createStatement(predicateURI, OWL.SAMEAS, wnURI);
997 statements.add(statement);
998
999
1000 }
1001
1002
1003 URI formURI = factory.createURI(namespace, lemma + "_form");
1004 statement = factory.createStatement(predicateURI, LEMON.CANONICAL_FORM, formURI);
1005 statements.add(statement);
1006 statement = factory.createStatement(formURI, RDF.TYPE, LEMON.FORM);
1007 statements.add(statement);
1008 statement = factory.createStatement(formURI, LEMON.WRITTEN_REP, factory.createLiteral(lemma, language));
1009 statements.add(statement);
1010 }
1011
1012 return predicateURI;
1013 }
1014
1015 private static URI createVerbNetURIForRole(String role, String namespace) {
1016 String vnID = "vn_role_" + role;
1017 return factory.createURI(namespace, vnID);
1018 }
1019
1020 private static URI createVerbNetURIForSense(String sense, String namespace) {
1021 String vnID = "vn_" + sense;
1022 return factory.createURI(namespace, vnID);
1023 }
1024
1025 private static URI createVerbNetURIForSenseRole(String sense, String role, String namespace) {
1026 String vnRoleID = "vn_role_" + sense + "_" + role;
1027 return factory.createURI(namespace, vnRoleID);
1028 }
1029
1030 private static String isGoodSense(String sense) {
1031 sense = getSenseNumberOnly(sense);
1032 Matcher matcher = VN_CODE_PATTERN.matcher(sense);
1033 if (!matcher.matches()) {
1034 LOGGER.trace("{} does not pass the match test", sense);
1035 return null;
1036 }
1037
1038 return sense;
1039 }
1040
1041 private static HashSet<String> getGoodSensesOnly(String vnSenseString) {
1042 HashSet<String> ret = new HashSet<String>();
1043
1044 if (vnSenseString != null && vnSenseString.trim().length() > 0) {
1045
1046
1047 if (vnSenseString.equals("29. 5")) {
1048 vnSenseString = "29.5";
1049 }
1050
1051 String[] vnSenses = vnSenseString.split("[\\s,]+");
1052
1053 for (String sense : vnSenses) {
1054 String okSense = isGoodSense(sense);
1055 if (okSense != null) {
1056 ret.add(okSense);
1057 }
1058 }
1059 }
1060
1061 return ret;
1062 }
1063
1064 private static String getSenseNumberOnly(String senseName) {
1065
1066
1067 if (senseName.equals("36.4-136.")) {
1068 senseName = "36.4-1";
1069 }
1070
1071
1072 if (senseName.equals("14-1S")) {
1073 senseName = "14-1";
1074 }
1075
1076
1077 if (senseName.equals("62t")) {
1078 senseName = "62";
1079 }
1080
1081
1082 if (senseName.equals("25.2t")) {
1083 senseName = "25.2";
1084 }
1085
1086 return senseName.replaceAll(VN_NAME_REGEXP, "");
1087 }
1088
1089 private static String getThetaName(String name) {
1090 Matcher matcher = THETA_NAME_PATTERN.matcher(name);
1091 if (matcher.matches()) {
1092 String num = matcher.group(2);
1093 if (num.equals("1")) {
1094 return matcher.group(1);
1095 }
1096 else {
1097 return "co-" + matcher.group(1);
1098 }
1099 }
1100 return name;
1101 }
1102
1103 private static boolean discardFile(File file, boolean onlyVerbs, boolean isOntoNotes) {
1104 if (file.isDirectory()) {
1105 LOGGER.trace("File {} is a directory", file.getName());
1106 return true;
1107 }
1108
1109 if (!file.getAbsolutePath().endsWith(".xml")) {
1110 LOGGER.trace("File {} is not XML", file.getName());
1111 return true;
1112 }
1113
1114 if (onlyVerbs && isOntoNotes) {
1115 if (!file.getAbsolutePath().endsWith("-v.xml")) {
1116 LOGGER.trace("File {} is not a verb", file.getName());
1117 return true;
1118 }
1119 }
1120
1121 return false;
1122 }
1123
1124 private static void addProperty(Collection<Statement> statements, URI uri, URI propertyName, String value, String language) {
1125 if (value != null && value.length() > 0) {
1126 Statement statement = factory.createStatement(uri, propertyName, factory.createLiteral(value, language));
1127 statements.add(statement);
1128 }
1129 }
1130
1131 }