1   package eu.fbk.dkm.premon.premonitor;
2   
3   import com.google.common.collect.HashMultimap;
4   import com.google.common.collect.Multimap;
5   import com.google.common.io.Files;
6   import eu.fbk.dkm.premon.premonitor.propbank.*;
7   import eu.fbk.dkm.premon.util.NF;
8   import eu.fbk.dkm.utils.CommandLine;
9   import eu.fbk.rdfpro.*;
10  import eu.fbk.rdfpro.util.Algebra;
11  import eu.fbk.rdfpro.util.QuadModel;
12  import org.openrdf.model.Statement;
13  import org.openrdf.model.URI;
14  import org.openrdf.model.Value;
15  import org.openrdf.model.impl.ValueFactoryImpl;
16  import org.openrdf.model.vocabulary.DCTERMS;
17  import org.openrdf.model.vocabulary.OWL;
18  import org.openrdf.model.vocabulary.RDF;
19  import org.openrdf.query.BindingSet;
20  import org.openrdf.query.algebra.TupleExpr;
21  import org.openrdf.rio.RDFHandler;
22  import org.openrdf.rio.RDFHandlerException;
23  import org.slf4j.Logger;
24  import org.slf4j.LoggerFactory;
25  
26  import javax.xml.bind.JAXBContext;
27  import javax.xml.bind.Unmarshaller;
28  import java.io.File;
29  import java.util.*;
30  import java.util.regex.Matcher;
31  import java.util.regex.Pattern;
32  
33  public class Premonitor_old {
34  
35  	static class PBfinfo {
36  		private String fileName;
37  		private String type;
38  		private String lemma;
39  
40  		public String getFileName() {
41  			return fileName;
42  		}
43  
44  		public String getType() {
45  			return type;
46  		}
47  
48  		public String getLemma() {
49  			return lemma;
50  		}
51  
52  		public PBfinfo(String fileName, boolean isOntoNotes) throws Exception {
53  			this.fileName = fileName;
54  			this.type = "v";
55  			this.lemma = fileName.replaceAll("\\.xml", "");
56  
57  			if (isOntoNotes) {
58  				Matcher matcher = ONTONOTES_FILENAME_PATTERN.matcher(fileName);
59  				if (matcher.matches()) {
60  					this.type = matcher.group(2);
61  					this.lemma = matcher.group(1);
62  				}
63  				else {
64  					throw new Exception("File " + fileName + " does not appear to be a good OntoNotes frame file");
65  				}
66  			}
67  
68  		}
69  	}
70  
	/* TODO:

	- add components
	- add sameAs links for the components

	- decide on the ontology
	- NomBank

	*/
80  
81  	private static final Logger LOGGER = LoggerFactory.getLogger(Premonitor_old.class);
82  
83  	static final String WN_NAMESPACE = "http://wordnet-rdf.princeton.edu/wn31/";
84  	static final Pattern ONTONOTES_FILENAME_PATTERN = Pattern.compile("(.*)-([a-z]+)\\.xml");
85  	static final Pattern THETA_NAME_PATTERN = Pattern.compile("^([^0-9]+)([0-9]+)$");
86  	static final String VN_NAME_REGEXP = "^[^0-9]+-";
87  	static final Pattern VN_CODE_PATTERN = Pattern.compile("^[0-9]+(\\.[0-9]+)*(-[0-9]+)*$");
88  
89  	static final ValueFactoryImpl factory = ValueFactoryImpl.getInstance();
90  
91  	static final String DEFAULT_LANGUAGE = "en";
92  	static final String DEFAULT_NAMESPACE = "http://pb2rdf.org/";
93  
94  	// Bugs!
95  	private static HashMap<String, String> bugMap = new HashMap<String, String>();
96  
97  	static {
98  		bugMap.put("@", "2"); // overburden-v.xml
99  		bugMap.put("av", "adv"); // turn-v.xml (turn.15)
100 		bugMap.put("ds", "dis"); // assume-v.xml
101 		bugMap.put("a", "agent"); // evolve-v.xml
102 		bugMap.put("pred", "prd"); // flatten-v.xml
103 		bugMap.put("o", "0"); // be.xml (be.04)
104 		bugMap.put("emitter of hoot", "0"); // hoot.xml
105 	}
106 
107 	private static HashMap<String, String> lemmaToTransform = new HashMap();
108 
109 	static {
110 		lemmaToTransform.put("cry+down(e)", "cry+down");
111 	}
112 
113 	private static HashSet<String> fileToDiscard = new HashSet<>();
114 
115 	static {
116 		fileToDiscard.add("except-v.xml");
117 	}
118 
119 	private static HashSet<String> functionTags = new HashSet<String>();
120 
121 	static {
122 		functionTags.add("ext");
123 		functionTags.add("loc");
124 		functionTags.add("dir");
125 		functionTags.add("neg");
126 		functionTags.add("mod");
127 		functionTags.add("adv");
128 		functionTags.add("mnr");
129 		functionTags.add("prd");
130 		functionTags.add("rec");
131 		functionTags.add("tmp");
132 		functionTags.add("prp");
133 		functionTags.add("pnc");
134 		functionTags.add("cau");
135 		functionTags.add("adj");
136 		functionTags.add("com");
137 		functionTags.add("dis");
138 		functionTags.add("dsp");
139 		functionTags.add("gol");
140 		functionTags.add("pag");
141 		functionTags.add("ppt");
142 		functionTags.add("rcl");
143 		functionTags.add("slc");
144 		functionTags.add("vsp");
145 		functionTags.add("lvb");
146 	}
147 
148 	private static HashMap<String, String> additionalWords = new HashMap<>();
149 
150 	static {
151 		additionalWords.put("through", "prep");
152 		additionalWords.put("vent", "n");
153 		additionalWords.put("away", "r");
154 		additionalWords.put("about", "r");
155 		additionalWords.put("back", "r");
156 		additionalWords.put("upon", "prep");
157 		additionalWords.put("aback", "r");
158 		additionalWords.put("down", "r");
159 		additionalWords.put("around", "r");
160 		additionalWords.put("out", "r");
161 		additionalWords.put("hold", "n");
162 		additionalWords.put("across", "r");
163 		additionalWords.put("along", "r");
164 		additionalWords.put("by", "prep");
165 		additionalWords.put("rubber", "n");
166 		additionalWords.put("up", "prep");
167 		additionalWords.put("after", "r");
168 		additionalWords.put("hard", "r");
169 		additionalWords.put("together", "r");
170 		additionalWords.put("on", "r");
171 		additionalWords.put("apart", "r");
172 		additionalWords.put("over", "r");
173 		additionalWords.put("in", "r");
174 		additionalWords.put("like", "prep");
175 		additionalWords.put("forward", "r");
176 		additionalWords.put("tree", "n");
177 		additionalWords.put("clear", "s");
178 		additionalWords.put("birth", "n");
179 		additionalWords.put("it", "pron");
180 		additionalWords.put("forth", "r");
181 		additionalWords.put("off", "r");
182 		additionalWords.put("wrong", "s");
183 		additionalWords.put("the", "art");
184 		additionalWords.put("aside", "r");
185 		additionalWords.put("even", "r");
186 		additionalWords.put("loose", "r");
187 		additionalWords.put("suit", "n");
188 		additionalWords.put("to", "prep");
189 		additionalWords.put("rise", "n");
190 	}
191 
192 	private static void addDefinition(Collection<Statement> statements, URI uri, URI definitionURI, String value, String language) {
193 		Statement statement;
194 		statement = factory.createStatement(definitionURI, RDF.TYPE, LEMON.SENSE_DEFINITION);
195 		statements.add(statement);
196 		statement = factory.createStatement(uri, LEMON.DEFINITION, definitionURI);
197 		statements.add(statement);
198 		statement = factory.createStatement(definitionURI, LEMON.VALUE, factory.createLiteral(value, language));
199 		statements.add(statement);
200 
201 	}
202 
203 	public static void main(String[] args) {
204 
205 		try {
206 			final CommandLine cmd = CommandLine
207 					.parser()
208 					.withName("./premonitor")
209 					.withHeader("Transform a ProbBank instance into RDF")
210 					.withOption("i", "input", "input folder", "FOLDER", CommandLine.Type.DIRECTORY_EXISTING, true, false, true)
211 					.withOption("w", "output", "Output file", "FILE", CommandLine.Type.FILE, true, false, true)
212 					.withOption("l", "lang", String.format("Language for literals, default %s", DEFAULT_LANGUAGE), "ISO-CODE", CommandLine.Type.STRING, true, false, false)
213 					.withOption("v", "non-verbs", "Extract also non-verbs (only for OntoNotes)")
214 					.withOption("o", "ontonotes", "Specify that this is an OntoNotes version of ProbBank")
215 					.withOption("e", "examples", "Extract examples")
216 					.withOption(null, "use-wn-lex", "Use WordNet LexicalEntries when available")
217 					.withOption("s", "single", "Extract single lemma", "LEMMA", CommandLine.Type.STRING, true, false, false)
218 					.withOption(null, "namespace", String.format("Namespace, default %s", DEFAULT_NAMESPACE), "URI", CommandLine.Type.STRING, true, false, false)
219 					.withOption(null, "wordnet", "WordNet RDF triple file", "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
220 					.withOption(null, "framenet", "FrameNet RDF triple file", "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
221 					.withOption(null, "verbnet", "VerbNet RDF triple file", "FILE", CommandLine.Type.FILE_EXISTING, true, false, false)
222 					.withLogger(LoggerFactory.getLogger("eu.fbk")).parse(args);
223 
224 			File folder = cmd.getOptionValue("input", File.class);
225 			File outputFile = cmd.getOptionValue("output", File.class);
226 
227 			File wnRDF = null;
228 			if (cmd.hasOption("wordnet")) {
229 				wnRDF = cmd.getOptionValue("wordnet", File.class);
230 			}
231 			File fnRDF = null;
232 			if (cmd.hasOption("framenet")) {
233 				fnRDF = cmd.getOptionValue("framenet", File.class);
234 			}
235 			File vnRDF = null;
236 			if (cmd.hasOption("verbnet")) {
237 				vnRDF = cmd.getOptionValue("verbnet", File.class);
238 			}
239 
240 			String language = DEFAULT_LANGUAGE;
241 			if (cmd.hasOption("lang")) {
242 				language = cmd.getOptionValue("lang", String.class);
243 			}
244 
245 			boolean onlyVerbs = !cmd.hasOption("non-verbs");
246 			boolean isOntoNotes = cmd.hasOption("ontonotes");
247 			boolean extractExamples = cmd.hasOption("examples");
248 			boolean useWordNetLEs = cmd.hasOption("use-wn-lex");
249 
250 			String onlyOne = null;
251 			if (cmd.hasOption("single")) {
252 				onlyOne = cmd.getOptionValue("single", String.class);
253 			}
254 
255 			String namespace = DEFAULT_NAMESPACE;
256 			if (cmd.hasOption("namespace")) {
257 				namespace = cmd.getOptionValue("namespace", String.class);
258 			}
259 
260 			// Fix due to XML library
261 			System.setProperty("javax.xml.accessExternalDTD", "file");
262 
263 			JAXBContext jaxbContext = JAXBContext.newInstance(Frameset.class);
264 			Unmarshaller jaxbUnmarshaller = jaxbContext.createUnmarshaller();
265 
266 			HashSet<Statement> statements = new HashSet<Statement>();
267 			Statement statement;
268 
269 //			String toTokenize = "a_beautiful_mind";
270 //			String[] tokens = toTokenize.split("_");
271 //			Resource base = factory.createURI("http://pb2rdf.org/look+up");
272 //
273 //			if (!(base instanceof URI)) {
274 //				throw new Exception("Input value must be a URI");
275 //			}
276 //
277 //			for (int i = 0; i < tokens.length; i++) {
278 //				String token = tokens[i];
279 //				BNode thisBNode = factory.createBNode();
280 //
281 //				URI componentURI = factory.createURI(base.toString() + "_c" + (i + 1));
282 //				statement = factory.createStatement(componentURI, RDF.TYPE, LEMON.COMPONENT);
283 //				statements.add(statement);
284 //
285 //				URI prev = RDF.REST;
286 //				if (i == 0) {
287 //					prev = LEMON.DECOMPOSITION;
288 //				}
289 //
290 //				statement = factory.createStatement(base, prev, thisBNode);
291 //				statements.add(statement);
292 //				statement = factory.createStatement(thisBNode, RDF.FIRST, componentURI);
293 //				statements.add(statement);
294 //
295 //				base = thisBNode;
296 //
297 //				if (i == tokens.length - 1) {
298 //					statement = factory.createStatement(thisBNode, RDF.REST, RDF.NIL);
299 //					statements.add(statement);
300 //				}
301 //			}
302 //
303 //
304 //			System.exit(1);
305 
306 			// Ontology
307 //			statements.addAll(PB2RDF.createOntologyStatements());
308 
309 			// Lexicon
310 			URI lexiconURI = factory.createURI(namespace, "lexicon");
311 			statement = factory.createStatement(lexiconURI, RDF.TYPE, LEMON.LEXICON);
312 			statements.add(statement);
313 			statement = factory.createStatement(lexiconURI, LEMON.LANGUAGE, factory.createLiteral("en"));
314 			statements.add(statement);
315 
316 			HashSet<String> roleNs = new HashSet<String>();
317 			HashSet<String> roleFs = new HashSet<String>();
318 
319 			HashSet<String> roleSetsToIgnore = new HashSet<String>();
320 
321 			final HashSet<URI> wnURIs = new HashSet<URI>();
322 			if (wnRDF != null) {
323 				LOGGER.info("Loading WordNet");
324 				RDFSource source = RDFSources.read(true, true, null, null, wnRDF.getAbsolutePath());
325 				source.emit(new AbstractRDFHandler() {
326 					@Override
327 					public void handleStatement(Statement statement) throws RDFHandlerException {
328 						if (statement.getPredicate().equals(RDF.TYPE) && statement.getObject().equals(LEMON.LEXICAL_ENTRY)) {
329 							if (statement.getSubject() instanceof URI) {
330 								synchronized (wnURIs) {
331 									wnURIs.add((URI) statement.getSubject());
332 								}
333 							}
334 						}
335 					}
336 				}, 1);
337 				LOGGER.info("Loaded {} URIs", wnURIs.size());
338 			}
339 
340 			Multimap<String, URI> fnFrames = HashMultimap.create();
341 			if (fnRDF != null) {
342 				LOGGER.info("Loading FrameNet");
343 				final QuadModel model = QuadModel.create();
344 				RDFSource source = RDFSources.read(true, true, null, null, fnRDF.getAbsolutePath());
345 				source.emit(new AbstractRDFHandler() {
346 					@Override
347 					public void handleStatement(Statement statement) throws RDFHandlerException {
348 						if (statement.getObject().equals(LEMON.LEXICAL_SENSE) && statement.getPredicate().equals(RDF.TYPE)) {
349 							synchronized (model) {
350 								model.add(statement);
351 							}
352 						}
353 						if (statement.getPredicate().equals(PURL.LABEL)) {
354 							synchronized (model) {
355 								model.add(statement);
356 							}
357 						}
358 					}
359 				}, 1);
360 				TupleExpr query = Algebra.parseTupleExpr(
361 						"SELECT ?s ?l\n" +
362 								"WHERE {\n" +
363 								"\t?s a <http://lemon-model.net/lemon#LexicalSense> .\n" +
364 								"\t?s <http://purl.org/olia/ubyCat.owl#label> ?l\n" +
365 								"}",
366 						null, null);
367 				Iterator<BindingSet> bindingSetIterator = model.evaluate(query, null, null);
368 				while (bindingSetIterator.hasNext()) {
369 					BindingSet bindings = bindingSetIterator.next();
370 					Value fnFrame = bindings.getValue("l");
371 					Value fnSense = bindings.getValue("s");
372 					if (fnSense instanceof URI) {
373 						String stringValue = fnFrame.stringValue().toLowerCase();
374 						fnFrames.put(stringValue, (URI) fnSense);
375 					}
376 				}
377 			}
378 
379 			Multimap<String, URI> vnFrames = HashMultimap.create();
380 
381 			if (vnRDF != null) {
382 				LOGGER.info("Loading VerbNet");
383 				final QuadModel model = QuadModel.create();
384 				RDFSource source = RDFSources.read(true, true, null, null, vnRDF.getAbsolutePath());
385 				source.emit(new AbstractRDFHandler() {
386 					@Override
387 					public void handleStatement(Statement statement) throws RDFHandlerException {
388 						if (statement.getObject().equals(LEMON.LEXICAL_SENSE) && statement.getPredicate().equals(RDF.TYPE)) {
389 							synchronized (model) {
390 								model.add(statement);
391 							}
392 						}
393 						if (statement.getPredicate().equals(PURL.LABEL)) {
394 							synchronized (model) {
395 								model.add(statement);
396 							}
397 						}
398 						if (statement.getPredicate().equals(PURL.SEMANTIC_LABEL)) {
399 							synchronized (model) {
400 								model.add(statement);
401 							}
402 						}
403 					}
404 				}, 1);
405 
406 				TupleExpr query;
407 				Iterator<BindingSet> bindingSetIterator;
408 
409 				// Frames
410 				query = Algebra.parseTupleExpr(
411 						"SELECT ?l ?s WHERE {\n" +
412 								"\t?s a <http://lemon-model.net/lemon#LexicalSense> .\n" +
413 								"\t?s <http://purl.org/olia/ubyCat.owl#semanticLabel> ?b .\n" +
414 								"\t?b <http://purl.org/olia/ubyCat.owl#label> ?l\n" +
415 								"}",
416 						null, null);
417 				bindingSetIterator = model.evaluate(query, null, null);
418 				while (bindingSetIterator.hasNext()) {
419 					BindingSet bindings = bindingSetIterator.next();
420 					Value vnFrame = bindings.getValue("l");
421 					Value vnSense = bindings.getValue("s");
422 					if (vnSense instanceof URI) {
423 						String stringValue = vnFrame.stringValue();
424 						stringValue = getSenseNumberOnly(stringValue);
425 						vnFrames.put(stringValue, (URI) vnSense);
426 					}
427 				}
428 			}
429 			for (String vnSense : vnFrames.keySet()) {
430 
431 				URI vnSenseURI = createVerbNetURIForSense(vnSense, namespace);
432 				statement = factory.createStatement(vnSenseURI, RDF.TYPE, LEMON.LEXICAL_SENSE);
433 				statements.add(statement);
434 
435 				for (URI sense : vnFrames.get(vnSense)) {
436 					statement = factory.createStatement(sense, LEMON.BROADER, vnSenseURI);
437 					statements.add(statement);
438 				}
439 			}
440 
441 
442 			// First tour
443 			LOGGER.info("Getting list of roles");
444 			HashSet<String> thetaRoles = new HashSet<String>();
445 			Multimap<String, String> rolesForSense = HashMultimap.create();
446 
447 			HashSet<String> allExternalTokens = new HashSet<>();
448 
449 			for (File file : Files.fileTreeTraverser().preOrderTraversal(folder)) {
450 
451 				if (discardFile(file, onlyVerbs, isOntoNotes)) {
452 					continue;
453 				}
454 
455 				PBfinfo fileInfo;
456 				try {
457 					fileInfo = new PBfinfo(file.getName(), isOntoNotes);
458 				} catch (Exception e) {
459 					throw e;
460 				}
461 
462 				String fileName = fileInfo.getFileName();
463 				String type = fileInfo.getType();
464 				String lemmaFromName = fileInfo.getLemma();
465 
466 				if (fileToDiscard.contains(fileName)) {
467 					continue;
468 				}
469 
470 				if (onlyOne != null && !onlyOne.equals(fileInfo.getLemma())) {
471 					continue;
472 				}
473 
474 				Frameset frameset = (Frameset) jaxbUnmarshaller.unmarshal(file);
475 				List<Object> noteOrPredicate = frameset.getNoteOrPredicate();
476 
477 				for (Object predicate : noteOrPredicate) {
478 					if (predicate instanceof Predicate) {
479 
480 						String lemma = ((Predicate) predicate).getLemma().replace('_', '+').replace(' ', '+');
481 						if (lemmaToTransform.keySet().contains(lemma)) {
482 							lemma = lemmaToTransform.get(lemma);
483 						}
484 						String[] tokens = lemma.split("\\+");
485 
486 						for (String token : tokens) {
487 							if (token.equals(lemmaFromName)) {
488 								continue;
489 							}
490 							allExternalTokens.add(token);
491 						}
492 
493 
494 						List<Object> noteOrRoleset = ((Predicate) predicate).getNoteOrRoleset();
495 						for (Object roleset : noteOrRoleset) {
496 							if (roleset instanceof Roleset) {
497 
498 								List<Object> rolesOrExample = ((Roleset) roleset).getNoteOrRolesOrExampleOrAliases();
499 								for (Object roles : rolesOrExample) {
500 									if (roles instanceof Roles) {
501 
502 										int okRoles = 0;
503 
504 										List<Object> noteOrRole = ((Roles) roles).getNoteOrRole();
505 										for (Object role : noteOrRole) {
506 											if (role instanceof Role) {
507 												String n = ((Role) role).getN();
508 												String f = ((Role) role).getF();
509 
510 												NF nf = new NF(n, f);
511 
512 												// Remove bugs
513 												String argName = nf.getArgName();
514 												if (argName == null) {
515 													continue;
516 												}
517 												if (bugMap.containsKey(argName)) {
518 													continue;
519 												}
520 
521 												if (nf.getN() != null) {
522 													roleNs.add(nf.getN());
523 													okRoles++;
524 												}
525 												if (nf.getF() != null) {
526 													roleFs.add(nf.getF());
527 												}
528 
529 												List<Vnrole> vnroleList = ((Role) role).getVnrole();
530 												for (Vnrole vnrole : vnroleList) {
531 													if (vnrole.getVntheta() != null && vnrole.getVntheta().trim().length() > 0) {
532 														String okRole = getThetaName(vnrole.getVntheta().toLowerCase());
533 														thetaRoles.add(okRole);
534 
535 														String vnSenseString = vnrole.getVncls();
536 														HashSet<String> senses = getGoodSensesOnly(vnSenseString);
537 														for (String sense : senses) {
538 															rolesForSense.put(sense, okRole);
539 														}
540 													}
541 												}
542 
543 											}
544 										}
545 
546 										if (okRoles == 0) {
547 											roleSetsToIgnore.add(((Roleset) roleset).getId());
548 										}
549 									}
550 								}
551 							}
552 						}
553 					}
554 				}
555 			}
556 
557 //			System.out.println(allExternalTokens.size());
558 //			System.out.println(allExternalTokens);
559 //			System.exit(1);
560 
561 			for (String thetaRole : thetaRoles) {
562 				URI vnRoleURI = createVerbNetURIForRole(thetaRole, namespace);
563 				statement = factory.createStatement(vnRoleURI, RDF.TYPE, PB2RDF.VN_THETA_ROLE_C);
564 				statements.add(statement);
565 			}
566 
567 			for (String vnSense : rolesForSense.keySet()) {
568 
569 				URI vnSenseURI = createVerbNetURIForSense(vnSense, namespace);
570 
571 				// It should already exist from parsing of the VerbNet dataset
572 				statement = factory.createStatement(vnSenseURI, RDF.TYPE, LEMON.LEXICAL_SENSE);
573 				statements.add(statement);
574 
575 				for (String role : rolesForSense.get(vnSense)) {
576 					URI vnSenseRoleURI = createVerbNetURIForSenseRole(vnSense, role, namespace);
577 					URI vnRoleURI = createVerbNetURIForRole(role, namespace);
578 
579 					statement = factory.createStatement(vnSenseRoleURI, RDF.TYPE, LEMON.ARGUMENT);
580 					statements.add(statement);
581 					statement = factory.createStatement(vnSenseRoleURI, LEMON.SEM_ARG, vnSenseURI);
582 					statements.add(statement);
583 					statement = factory.createStatement(vnSenseRoleURI, PB2RDF.VN_THETA_ROLE, vnRoleURI);
584 					statements.add(statement);
585 				}
586 			}
587 
588 			// Create dictionary
589 			//todo: distinguish between different types of roles (numeric and generic)?
590 
591 			HashMap<String, Statement> roleStatements = new HashMap<String, Statement>();
592 
593 			// There should be only numbers
594 			for (String n : roleNs) {
595 				try {
596 					Integer number = Integer.parseInt(n);
597 					roleStatements.put(number.toString(), factory.createStatement(PB2RDF.createRole(number), RDF.TYPE, PB2RDF.PB_THETA_ROLE_C));
598 				} catch (Exception ignored) {
599 					// ignored
600 				}
601 			}
602 
603 			// Adding agent
604 			roleStatements.put(NF.AGENT, factory.createStatement(PB2RDF.createRole(NF.AGENT), RDF.TYPE, PB2RDF.PB_THETA_ROLE_C));
605 			roleStatements.put(NF.MOD, factory.createStatement(PB2RDF.createRole(NF.MOD), RDF.TYPE, PB2RDF.PB_THETA_ROLE_C)); //todo: see NF class
606 
607 			for (String functionTag : functionTags) {
608 				roleStatements.put(functionTag, factory.createStatement(PB2RDF.createRole(functionTag), RDF.TYPE, PB2RDF.PB_THETA_ROLE_C));
609 			}
610 
611 
612 			for (String f : roleFs) {
613 				roleStatements.put(f, factory.createStatement(PB2RDF.createRole(f), RDF.TYPE, PB2RDF.PB_THETA_ROLE_C));
614 			}
615 
616 			for (String key : roleStatements.keySet()) {
617 				statements.add(roleStatements.get(key));
618 			}
619 
620 
621 			// Second tour
622 			LOGGER.info("Parsing PropBank files");
623 			for (File file : Files.fileTreeTraverser().preOrderTraversal(folder)) {
624 
625 				if (discardFile(file, onlyVerbs, isOntoNotes)) {
626 					continue;
627 				}
628 
629 				PBfinfo fileInfo;
630 				try {
631 					fileInfo = new PBfinfo(file.getName(), isOntoNotes);
632 				} catch (Exception e) {
633 					throw e;
634 				}
635 
636 				String fileName = fileInfo.getFileName();
637 				String type = fileInfo.getType();
638 				String lemmaFromName = fileInfo.getLemma();
639 
640 				if (fileToDiscard.contains(fileName)) {
641 					continue;
642 				}
643 
644 				if (onlyOne != null && !onlyOne.equals(lemmaFromName)) {
645 					continue;
646 				}
647 
648 				LOGGER.debug("{} ({})", fileName, type);
649 
650 				Frameset frameset = (Frameset) jaxbUnmarshaller.unmarshal(file);
651 				List<Object> noteOrPredicate = frameset.getNoteOrPredicate();
652 
653 				for (Object predicate : noteOrPredicate) {
654 					if (predicate instanceof Predicate) {
655 
656 						String lemma = ((Predicate) predicate).getLemma().replace('_', '+').replace(' ', '+');
657 						if (lemmaToTransform.keySet().contains(lemma)) {
658 							lemma = lemmaToTransform.get(lemma);
659 						}
660 
661 						String wnLemma = lemma + "-" + type;
662 
663 						URI predicateURI = addLexicalEntry(useWordNetLEs, namespace, wnLemma, statements, lexiconURI, language, wnURIs);
664 
665 						List<Object> noteOrRoleset = ((Predicate) predicate).getNoteOrRoleset();
666 						for (Object roleset : noteOrRoleset) {
667 							if (roleset instanceof Roleset) {
668 								String rolesetID = ((Roleset) roleset).getId();
669 
670 								String[] vnClasses = new String[0];
671 								if (((Roleset) roleset).getVncls() != null) {
672 									vnClasses = ((Roleset) roleset).getVncls().trim().split("\\s+");
673 								}
674 
675 								String[] fnPredicates = new String[0];
676 								if (((Roleset) roleset).getFramnet() != null) {
677 									fnPredicates = ((Roleset) roleset).getFramnet().trim().toLowerCase().split("\\s+");
678 								}
679 
680 								if (roleSetsToIgnore.contains(rolesetID)) {
681 									continue;
682 								}
683 
684 								String name = ((Roleset) roleset).getName();
685 
686 								URI senseURI = factory.createURI(namespace, rolesetID);
687 
688 								//todo: add roleset as a property?
689 								statement = factory.createStatement(senseURI, RDF.TYPE, LEMON.LEXICAL_SENSE);
690 								statements.add(statement);
691 								statement = factory.createStatement(senseURI, DCTERMS.SOURCE, factory.createLiteral(fileName));
692 								statements.add(statement);
693 								statement = factory.createStatement(predicateURI, LEMON.SENSE, senseURI);
694 								statements.add(statement);
695 
696 								for (String vnSense : vnClasses) {
697 
698 									if (!vnFrames.containsKey(vnSense)) {
699 										continue;
700 									}
701 
702 									URI vnSenseURI = createVerbNetURIForSense(vnSense, namespace);
703 
704 									statement = factory.createStatement(senseURI, LEMON.BROADER, vnSenseURI);
705 									statements.add(statement);
706 								}
707 
708 								for (String fnPredicate : fnPredicates) {
709 									if (!fnFrames.containsKey(fnPredicate)) {
710 										continue;
711 									}
712 
713 									for (URI fnSenseURI : fnFrames.get(fnPredicate)) {
714 										statement = factory.createStatement(senseURI, PB2RDF.SIMILAR, fnSenseURI);
715 										statements.add(statement);
716 									}
717 								}
718 
719 
720 								if (name != null && name.length() > 0) {
721 									URI definitionURI = factory.createURI(namespace, rolesetID + "_def");
722 									addDefinition(statements, senseURI, definitionURI, name, language);
723 								}
724 
725 								List<Object> rolesOrExample = ((Roleset) roleset).getNoteOrRolesOrExampleOrAliases();
726 
727 								List<Example> examples = new ArrayList<Example>();
728 
729 								for (Object rOrE : rolesOrExample) {
730 									if (rOrE instanceof Roles) {
731 										List<Object> noteOrRole = ((Roles) rOrE).getNoteOrRole();
732 										for (Object role : noteOrRole) {
733 											if (role instanceof Role) {
734 												String n = ((Role) role).getN();
735 												String f = ((Role) role).getF();
736 												String descr = ((Role) role).getDescr();
737 												List<Vnrole> vnroleList = ((Role) role).getVnrole();
738 
739 												NF nf = new NF(n, f);
740 												String argName = nf.getArgName();
741 												if (argName == null) {
742 													//todo: this should never happen; check consistency between the list of roles and the examples
743 													continue;
744 												}
745 
746 												// Bugs!
747 												if (bugMap.containsKey(argName)) {
748 													argName = bugMap.get(argName);
749 												}
750 
751 												String roleText = rolesetID + "_role-" + nf.getArgName();
752 												URI roleURI = factory.createURI(namespace, roleText);
753 
754 												statement = factory.createStatement(roleURI, RDF.TYPE, LEMON.ARGUMENT);
755 												statements.add(statement);
756 												statement = factory.createStatement(senseURI, LEMON.SEM_ARG, roleURI);
757 												statements.add(statement);
758 												try {
759 													statement = factory.createStatement(roleURI, PB2RDF.PB_THETA_ROLE, roleStatements.get(argName).getSubject());
760 													statements.add(statement);
761 												} catch (Exception e) {
762 													LOGGER.error(argName + " " + roleText + " " + fileName);
763 												}
764 
765 												if (descr != null && descr.length() > 0) {
766 													URI definitionURI = factory.createURI(namespace, roleText + "_def");
767 													addDefinition(statements, roleURI, definitionURI, descr, language);
768 												}
769 
770 												for (Vnrole vnrole : vnroleList) {
771 													String vnSenseString = vnrole.getVncls();
772 													String vnTheta = vnrole.getVntheta();
773 
774 													HashSet<String> senses = getGoodSensesOnly(vnSenseString);
775 
776 													if (vnTheta != null && vnTheta.trim().length() > 0) {
777 														for (String sense : senses) {
778 															URI uri = createVerbNetURIForSenseRole(sense, vnTheta, namespace);
779 															statement = factory.createStatement(roleURI, PB2RDF.ARG_SIMILAR, uri);
780 															statements.add(statement);
781 														}
782 													}
783 												}
784 											}
785 										}
786 									}
787 
788 									if (extractExamples) {
789 										if (rOrE instanceof Example) {
790 											examples.add((Example) rOrE);
791 										}
792 									}
793 								}
794 
795 								//todo: shall we start from 0?
796 								int example = 1;
797 
798 								for (Example rOrE : examples) {
799 									String text = null;
800 									Inflection inflection = null;
801 
802 									String exType = rOrE.getType();
803 									String exName = rOrE.getName();
804 									String exSrc = rOrE.getSrc();
805 
806 									List<Rel> myRels = new ArrayList<Rel>();
807 									List<Arg> myArgs = new ArrayList<Arg>();
808 
809 									List<Object> exThings = rOrE.getInflectionOrNoteOrTextOrArgOrRel();
810 									for (Object thing : exThings) {
811 										if (thing instanceof Text) {
812 											text = ((Text) thing).getvalue().replaceAll("\\s+", " ").trim();
813 										}
814 										if (thing instanceof Inflection) {
815 											inflection = (Inflection) thing;
816 										}
817 
818 										if (thing instanceof Arg) {
819 											myArgs.add((Arg) thing);
820 										}
821 
822 										// Should be one, but it's not defined into the DTD
823 										if (thing instanceof Rel) {
824 											myRels.add((Rel) thing);
825 										}
826 									}
827 
828 									if (text != null && text.length() > 0) {
829 
830 										String exampleStr = rolesetID + "_ex" + (examples.size() > 1 ? example++ : "");
831 										URI exampleURI = factory.createURI(namespace, exampleStr);
832 
833 										statement = factory.createStatement(exampleURI, RDF.TYPE, LEMON.USAGE_EXAMPLE);
834 										statements.add(statement);
835 										statement = factory.createStatement(senseURI, LEMON.EXAMPLE, exampleURI);
836 										statements.add(statement);
837 
838 										// Properties
839 										addProperty(statements, exampleURI, PB2RDF.PB_EX_NAME, exName, language);
840 										addProperty(statements, exampleURI, PB2RDF.PB_EX_SRC, exSrc, language);
841 										addProperty(statements, exampleURI, PB2RDF.PB_EX_TYPE, exType, language);
842 										addProperty(statements, exampleURI, LEMON.VALUE, text, language);
843 
844 										Map<String, List<Arg>> exampleArgs = new HashMap<String, List<Arg>>();
845 										for (Arg myArg : myArgs) {
846 
847 											NF nf = new NF(myArg.getN(), myArg.getF());
848 											String argName = nf.getArgName();
849 
850 											if (argName == null) {
851 												//todo: this should not happen, but it happens
852 												continue;
853 											}
854 
855 											// Bugs!
856 											if (bugMap.containsKey(argName)) {
857 												argName = bugMap.get(argName);
858 											}
859 
860 											if (!exampleArgs.containsKey(argName)) {
861 												exampleArgs.put(argName, new ArrayList<Arg>());
862 											}
863 											exampleArgs.get(argName).add(myArg);
864 										}
865 
866 										for (Map.Entry<String, List<Arg>> entry : exampleArgs.entrySet()) {
867 											String argName = entry.getKey();
868 											List<Arg> value = entry.getValue();
869 											for (int i = 0; i < value.size(); i++) {
870 												Arg myArg = value.get(i);
871 												String argValue = myArg.getvalue();
872 												if (argValue == null) {
873 													throw new Exception("argValue is null");
874 												}
875 
876 												String addendum = "";
877 												if (value.size() > 1) {
878 													addendum = "_" + (i + 1);
879 												}
880 
881 												URI argURI = factory.createURI(namespace, exampleStr + "_arg-" + argName + addendum);
882 
883 												statement = factory.createStatement(argURI, RDF.TYPE, PB2RDF.EX_ARG_C);
884 												statements.add(statement);
885 												statement = factory.createStatement(exampleURI, PB2RDF.PB_EX_ARG, argURI);
886 												statements.add(statement);
887 												statement = factory.createStatement(argURI, LEMON.VALUE, factory.createLiteral(argValue, language));
888 												statements.add(statement);
889 												statement = factory.createStatement(argURI, PB2RDF.PB_THETA_ROLE, roleStatements.get(argName).getSubject());
890 												statements.add(statement);
891 											}
892 										}
893 
894 										for (int i = 0; i < myRels.size(); i++) {
895 											Rel myRel = myRels.get(i);
896 
897 											String addendum = "";
898 											if (myRels.size() > 1) {
899 												addendum += "_" + (i + 1);
900 											}
901 
902 											NF nf = new NF(null, myRel.getF());
903 											String relName = nf.getArgName();
904 											String relValue = myRel.getvalue();
905 
906 											if (relValue == null) {
907 												throw new Exception("argValue is null");
908 											}
909 
910 											URI relURI = factory.createURI(namespace, exampleStr + "_rel" + addendum);
911 
912 											statement = factory.createStatement(relURI, RDF.TYPE, PB2RDF.EX_REL_C);
913 											statements.add(statement);
914 											statement = factory.createStatement(exampleURI, PB2RDF.PB_EX_REL, relURI);
915 											statements.add(statement);
916 											statement = factory.createStatement(relURI, LEMON.VALUE, factory.createLiteral(relValue, language));
917 											statements.add(statement);
918 											if (relName != null) {
919 												statement = factory.createStatement(relURI, PB2RDF.PB_THETA_ROLE, roleStatements.get(relName).getSubject());
920 												statements.add(statement);
921 											}
922 										}
923 
924 										if (inflection != null) {
925 											URI inflectionURI = factory.createURI(namespace, exampleStr + "_inflection");
926 
927 											statement = factory.createStatement(inflectionURI, RDF.TYPE, PB2RDF.INFLECTION_C);
928 											statements.add(statement);
929 											statement = factory.createStatement(exampleURI, PB2RDF.PB_EX_INFLECTION, inflectionURI);
930 											statements.add(statement);
931 
932 											// Properties
933 											addProperty(statements, inflectionURI, PB2RDF.PB_INF_ASPECT, inflection.getAspect(), language);
934 											addProperty(statements, inflectionURI, PB2RDF.PB_INF_FORM, inflection.getForm(), language);
935 											addProperty(statements, inflectionURI, PB2RDF.PB_INF_PERSON, inflection.getPerson(), language);
936 											addProperty(statements, inflectionURI, PB2RDF.PB_INF_TENSE, inflection.getTense(), language);
937 											addProperty(statements, inflectionURI, PB2RDF.PB_INF_VOICE, inflection.getVoice(), language);
938 										}
939 									}
940 
941 								}
942 
943 							}
944 						}
945 					}
946 				}
947 			}
948 
949 			RDFSource source = RDFSources.wrap(statements);
950 			try {
951 				RDFHandler rdfHandler = RDFHandlers.write(null, 1000, outputFile.getAbsolutePath());
952 				RDFProcessors
953 						.sequence(RDFProcessors.prefix(null), RDFProcessors.unique(false))
954 						.apply(source, rdfHandler, 1);
955 			} catch (Exception e) {
956 				LOGGER.error("Input/output error, the file {} has not been saved ({})", outputFile.getAbsolutePath(), e.getMessage());
957 				throw new RDFHandlerException(e);
958 			}
959 
960 			LOGGER.info("File {} saved", outputFile.getAbsolutePath());
961 
962 		} catch (Throwable ex) {
963 			CommandLine.fail(ex);
964 		}
965 
966 
967 	}
968 
969 	private static URI addLexicalEntry(boolean useWordNetLEs, String namespace, String lemma, Collection<Statement> statements, URI lexiconURI, String language, HashSet<URI> wnURIs) {
970 		Statement statement;
971 		URI wnURI = factory.createURI(WN_NAMESPACE, lemma);
972 
973 		URI predicateURI;
974 		if (wnURIs.contains(wnURI) && useWordNetLEs) {
975 			predicateURI = wnURI;
976 		}
977 		else {
978 			LOGGER.info("Word {} is not in WordNet", lemma);
979 			LOGGER.info(wnURI.toString());
980 
981 			// Tokenize
982 			String[] tokens = lemma.replaceAll("-[a-z]+$", "").split("\\+");
983 			if (tokens.length > 1) {
984 
985 			}
986 
987 			predicateURI = factory.createURI(namespace, lemma);
988 			statement = factory.createStatement(predicateURI, RDF.TYPE, LEMON.LEXICAL_ENTRY);
989 			statements.add(statement);
990 			statement = factory.createStatement(lexiconURI, LEMON.ENTRY, predicateURI);
991 			statements.add(statement);
992 
993 			// todo: aggiungere component
994 
995 			if (wnURIs.contains(wnURI)) {
996 				statement = factory.createStatement(predicateURI, OWL.SAMEAS, wnURI);
997 				statements.add(statement);
998 
999 				// todo: aggiungere sameas dei component
1000 			}
1001 
1002 			// Using this paradigm, there is only one form
1003 			URI formURI = factory.createURI(namespace, lemma + "_form");
1004 			statement = factory.createStatement(predicateURI, LEMON.CANONICAL_FORM, formURI);
1005 			statements.add(statement);
1006 			statement = factory.createStatement(formURI, RDF.TYPE, LEMON.FORM);
1007 			statements.add(statement);
1008 			statement = factory.createStatement(formURI, LEMON.WRITTEN_REP, factory.createLiteral(lemma, language));
1009 			statements.add(statement);
1010 		}
1011 
1012 		return predicateURI;
1013 	}
1014 
1015 	private static URI createVerbNetURIForRole(String role, String namespace) {
1016 		String vnID = "vn_role_" + role;
1017 		return factory.createURI(namespace, vnID);
1018 	}
1019 
1020 	private static URI createVerbNetURIForSense(String sense, String namespace) {
1021 		String vnID = "vn_" + sense;
1022 		return factory.createURI(namespace, vnID);
1023 	}
1024 
1025 	private static URI createVerbNetURIForSenseRole(String sense, String role, String namespace) {
1026 		String vnRoleID = "vn_role_" + sense + "_" + role;
1027 		return factory.createURI(namespace, vnRoleID);
1028 	}
1029 
1030 	private static String isGoodSense(String sense) {
1031 		sense = getSenseNumberOnly(sense);
1032 		Matcher matcher = VN_CODE_PATTERN.matcher(sense);
1033 		if (!matcher.matches()) {
1034 			LOGGER.trace("{} does not pass the match test", sense);
1035 			return null;
1036 		}
1037 
1038 		return sense;
1039 	}
1040 
1041 	private static HashSet<String> getGoodSensesOnly(String vnSenseString) {
1042 		HashSet<String> ret = new HashSet<String>();
1043 
1044 		if (vnSenseString != null && vnSenseString.trim().length() > 0) {
1045 
1046 			// Fix: attest-v.xml
1047 			if (vnSenseString.equals("29. 5")) {
1048 				vnSenseString = "29.5";
1049 			}
1050 
1051 			String[] vnSenses = vnSenseString.split("[\\s,]+");
1052 
1053 			for (String sense : vnSenses) {
1054 				String okSense = isGoodSense(sense);
1055 				if (okSense != null) {
1056 					ret.add(okSense);
1057 				}
1058 			}
1059 		}
1060 
1061 		return ret;
1062 	}
1063 
1064 	private static String getSenseNumberOnly(String senseName) {
1065 
1066 		// Fix: conflict-v.xml
1067 		if (senseName.equals("36.4-136.")) {
1068 			senseName = "36.4-1";
1069 		}
1070 
1071 		// Fix: cram-v.xml
1072 		if (senseName.equals("14-1S")) {
1073 			senseName = "14-1";
1074 		}
1075 
1076 		// Fix: plan-v.xml
1077 		if (senseName.equals("62t")) {
1078 			senseName = "62";
1079 		}
1080 
1081 		// Fix: plot-v.xml
1082 		if (senseName.equals("25.2t")) {
1083 			senseName = "25.2";
1084 		}
1085 
1086 		return senseName.replaceAll(VN_NAME_REGEXP, "");
1087 	}
1088 
1089 	private static String getThetaName(String name) {
1090 		Matcher matcher = THETA_NAME_PATTERN.matcher(name);
1091 		if (matcher.matches()) {
1092 			String num = matcher.group(2);
1093 			if (num.equals("1")) {
1094 				return matcher.group(1);
1095 			}
1096 			else {
1097 				return "co-" + matcher.group(1);
1098 			}
1099 		}
1100 		return name;
1101 	}
1102 
1103 	private static boolean discardFile(File file, boolean onlyVerbs, boolean isOntoNotes) {
1104 		if (file.isDirectory()) {
1105 			LOGGER.trace("File {} is a directory", file.getName());
1106 			return true;
1107 		}
1108 
1109 		if (!file.getAbsolutePath().endsWith(".xml")) {
1110 			LOGGER.trace("File {} is not XML", file.getName());
1111 			return true;
1112 		}
1113 
1114 		if (onlyVerbs && isOntoNotes) {
1115 			if (!file.getAbsolutePath().endsWith("-v.xml")) {
1116 				LOGGER.trace("File {} is not a verb", file.getName());
1117 				return true;
1118 			}
1119 		}
1120 
1121 		return false;
1122 	}
1123 
1124 	private static void addProperty(Collection<Statement> statements, URI uri, URI propertyName, String value, String language) {
1125 		if (value != null && value.length() > 0) {
1126 			Statement statement = factory.createStatement(uri, propertyName, factory.createLiteral(value, language));
1127 			statements.add(statement);
1128 		}
1129 	}
1130 
1131 }