1   package eu.fbk.dkm.premon.premonitor;
2   
3   import com.google.common.io.Files;
4   import eu.fbk.dkm.premon.vocab.*;
5   import eu.fbk.dkm.utils.FrequencyHashSet;
6   import eu.fbk.rdfpro.RDFHandlers;
7   import org.joox.JOOX;
8   import org.joox.Match;
9   import org.jsoup.Jsoup;
10  import org.openrdf.model.Statement;
11  import org.openrdf.model.URI;
12  import org.openrdf.model.vocabulary.DCTERMS;
13  import org.openrdf.model.vocabulary.RDF;
14  import org.openrdf.model.vocabulary.RDFS;
15  import org.openrdf.model.vocabulary.SKOS;
16  import org.openrdf.rio.RDFHandler;
17  import org.slf4j.Logger;
18  import org.slf4j.LoggerFactory;
19  import org.w3c.dom.Document;
20  import org.w3c.dom.Element;
21  
22  import javax.annotation.Nullable;
23  import javax.xml.parsers.DocumentBuilderFactory;
24  import java.io.File;
25  import java.io.IOException;
26  import java.text.DateFormat;
27  import java.text.SimpleDateFormat;
28  import java.util.*;
29  import java.util.regex.Matcher;
30  import java.util.regex.Pattern;
31  
32  public class FramenetConverter extends Converter {
33  
34      private static final Logger LOGGER = LoggerFactory.getLogger(FramenetConverter.class);
35      HashMap<String, File> paths = new HashMap<>();
36      private String retroMappings = null;
37  
38      // 01/28/2002 04:30:50 PST Mon
39      private static final DateFormat format = new SimpleDateFormat("MM/dd/yyyy HH:mm:ss z E", Locale.ENGLISH);
40  
41      private static final Pattern TOKEN_REGEX = Pattern.compile("[^\\s]+");
42  
43      //        private static final String ONE_FRAME = "Measurable_attributes.xml";
44      private static final String ONE_FRAME = null;
45      private static final Set<String> bugMap = new HashSet<>();
46  
47      private String thisVersion = null;
48  
49      public FramenetConverter(File path, RDFHandler sink, Properties properties, Map<String, URI> wnInfo) {
50          super(path, properties.getProperty("source"), sink, properties, properties.getProperty("language"), wnInfo);
51  
52          paths.put("frame", new File(this.path.getAbsolutePath() + File.separator + "frame"));
53          paths.put("lu", new File(this.path.getAbsolutePath() + File.separator + "lu"));
54          paths.put("luIndex", new File(this.path.getAbsolutePath() + File.separator + "luIndex.xml"));
55          paths.put("semTypes", new File(this.path.getAbsolutePath() + File.separator + "semTypes.xml"));
56          paths.put("frRelation", new File(this.path.getAbsolutePath() + File.separator + "frRelation.xml"));
57          paths.put("retroMappings",
58                  new File(this.path.getAbsolutePath() + File.separator + properties.getProperty("retromapfile")));
59  
60          bugMap.add("Test35");
61          bugMap.add("Test_the_test");
62  
63  //        argumentSeparator = "@";
64  
65          retroMappings = properties.getProperty("retromappings");
66          thisVersion = properties.getProperty("thisversion", "1.5");
67  
68          LOGGER.info("Starting dataset: {}", prefix);
69      }
70  
71      @Override public void convert() throws IOException {
72  
73          addMetaToSink();
74  
75          final DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
76  
77          for (String key : paths.keySet()) {
78              File value = paths.get(key);
79              if (!value.exists()) {
80                  LOGGER.error("Path {} does not exist", value.getAbsolutePath());
81              }
82          }
83  
84          try {
85  
86              String tagPrevious = "r1-5";
87              String tagCurrent = "r1-6";
88              String fnPrevious = "r1-5_FrameName";
89  
90              if (thisVersion.equals("1.7")) {
91                  tagPrevious = "r1.6";
92                  tagCurrent = "r1.7";
93                  fnPrevious = "r1.6_FrameName";
94              }
95  
96              Document document;
97              HashSet<String> added = new HashSet<>();
98              HashMap<String, String> changed = new HashMap<>();
99  
100             // Retro Mappings
101             if (retroMappings != null) {
102                 LOGGER.info("Extracting diff file");
103                 document = dbf.newDocumentBuilder().parse(paths.get("retroMappings"));
104 
105                 Match diffs;
106 
107                 diffs = JOOX.$(document).xpath("FrameDiff/Added/Frame");
108                 for (Element diff : diffs) {
109                     added.add(diff.getTextContent().toLowerCase());
110                 }
111                 diffs = JOOX.$(document).xpath("FrameDiff/Changed/Frame");
112                 for (Element diff : diffs) {
113                     String r15 = JOOX.$(diff.getElementsByTagName(tagPrevious)).text().toLowerCase();
114                     String r16 = JOOX.$(diff.getElementsByTagName(tagCurrent)).text().toLowerCase();
115                     changed.put(r16, r15);
116                 }
117 
118                 diffs = JOOX.$(document).xpath("FrameElementDiff/Added/FrameElement");
119                 for (Element diff : diffs) {
120                     added.add(diff.getAttribute("FrameName").toLowerCase() + argumentSeparator + diff.getTextContent()
121                             .toLowerCase());
122                 }
123                 diffs = JOOX.$(document).xpath("FrameElementDiff/Changed/FrameElement");
124                 for (Element diff : diffs) {
125                     String f16 = diff.getAttribute("FrameName").toLowerCase();
126                     String f15 = diff.getAttribute(fnPrevious);
127                     if (f15 == null || f15.length() == 0) {
128                         f15 = f16;
129                     }
130                     f15 = f15.toLowerCase();
131                     String r15 = JOOX.$(diff.getElementsByTagName(tagPrevious)).text().toLowerCase();
132                     String r16 = JOOX.$(diff.getElementsByTagName(tagCurrent)).text().toLowerCase();
133                     changed.put(f16 + argumentSeparator + r16, f15 + argumentSeparator + r15);
134                 }
135 
136             }
137 
138             // luIndex
139             LOGGER.info("Extracting luIndex");
140             document = dbf.newDocumentBuilder().parse(paths.get("luIndex"));
141             Match statusTypes = JOOX.$(document.getElementsByTagName("statusType"));
142             for (Element statusType : statusTypes) {
143                 addStatusToSink(statusType);
144             }
145 
146             // semTypes
147             LOGGER.info("Extracting semTypes");
148             document = dbf.newDocumentBuilder().parse(paths.get("semTypes"));
149             Match semTypes = JOOX.$(document.getElementsByTagName("semType"));
150             for (Element semType : semTypes) {
151                 addSemTypeToSink(semType);
152             }
153 
154             // frRelation
155             FrequencyHashSet<URI> typesFreqs = new FrequencyHashSet<>();
156             FrequencyHashSet<URI> typesFreqsFER = new FrequencyHashSet<>();
157 
158             LOGGER.info("Extracting frRelations");
159             document = dbf.newDocumentBuilder().parse(paths.get("frRelation"));
160             Match frRelationTypes = JOOX.$(document.getElementsByTagName("frameRelationType"));
161             for (Element frRelationType : frRelationTypes) {
162                 String name = frRelationType.getAttribute("name");
163                 URI typeURI = null;
164 
165                 boolean invert = false;
166 
167                 switch (name) {
168                 case "Inheritance":
169                     typeURI = PMOFN.INHERITS_FROM;
170                     break;
171                 case "Subframe":
172                     typeURI = PMOFN.SUBFRAME_OF;
173                     break;
174                 case "Using":
175                     typeURI = PMOFN.USES;
176                     break;
177                 case "See_also":
178                     typeURI = PMOFN.SEE_ALSO;
179                     invert = true;
180                     break;
181                 case "Inchoative_of":
182                     typeURI = PMOFN.IS_INCHOATIVE_OF;
183                     invert = true;
184                     break;
185                 case "Causative_of":
186                     typeURI = PMOFN.IS_CAUSATIVE_OF;
187                     invert = true;
188                     break;
189                 case "Precedes":
190                     typeURI = PMOFN.PRECEDES;
191                     invert = true;
192                     break;
193                 case "Perspective_on":
194                     typeURI = PMOFN.PERSPECTIVE_ON;
195                     break;
196                 case "ReFraming_Mapping":
197                     typeURI = PMOFN.REFRAME_MAPPING;
198                     break;
199                 case "Metaphor":
200                     typeURI = PMOFN.METAPHOR;
201                     break;
202                 }
203 
204                 if (typeURI == null) {
205                     LOGGER.error("typeURI is null ({})", name);
206                     continue;
207                 }
208 
209                 Match frameRelations = JOOX.$(frRelationType.getElementsByTagName("frameRelation"));
210                 for (Element frameRelation : frameRelations) {
211                     String subFrameName = frameRelation.getAttribute("subFrameName");
212                     String superFrameName = frameRelation.getAttribute("superFrameName");
213 
214                     if (invert) {
215                         addStatementToSink(uriForRoleset(superFrameName.toLowerCase()), typeURI,
216                                 uriForRoleset(subFrameName.toLowerCase()));
217                     } else {
218                         addStatementToSink(uriForRoleset(subFrameName.toLowerCase()), typeURI,
219                                 uriForRoleset(superFrameName.toLowerCase()));
220                     }
221                     typesFreqs.add(typeURI);
222 
223                     Match feRelations = JOOX.$(frameRelation.getElementsByTagName("FERelation"));
224                     for (Element feRelation : feRelations) {
225 
226                         // todo: molto brutto
227                         URI relURI = createURI(typeURI.toString() + "FER");
228 
229                         URI arg1 = uriForArgument(subFrameName.toLowerCase(), feRelation.getAttribute("subFEName"));
230                         URI arg2 = uriForArgument(superFrameName.toLowerCase(), feRelation.getAttribute("superFEName"));
231 
232                         if (invert) {
233                             addStatementToSink(arg2, relURI, arg1);
234                         } else {
235                             addStatementToSink(arg1, relURI, arg2);
236                         }
237                         typesFreqsFER.add(relURI);
238                     }
239 
240                 }
241 
242             }
243 
244             // frame
245             LOGGER.info("Extracting frames");
246             int luCount = 0;
247             int mapCount = 0;
248             int mapRoleCount = 0;
249             FrequencyHashSet<URI> semTypesFreq = new FrequencyHashSet<>();
250             FrequencyHashSet<URI> semTypesForFrame = new FrequencyHashSet<>();
251             HashMap<String, URI> lus = new HashMap<>();
252             for (final File file : Files.fileTreeTraverser().preOrderTraversal(paths.get("frame"))) {
253                 if (!file.isDirectory() && file.getName().endsWith(".xml")) {
254                     LOGGER.debug("Processing {} ...", file);
255 
256                     if (ONE_FRAME != null) {
257                         if (!file.getName().equals(ONE_FRAME)) {
258                             continue;
259                         }
260                     }
261 
262                     try {
263                         document = dbf.newDocumentBuilder().parse(file);
264                         final Match frame = JOOX.$(document.getElementsByTagName("frame"));
265 
266                         for (Element element : frame) {
267                             String cBy = frame.attr("cBy");
268                             String cDate = frame.attr("cDate");
269                             String identifier = frame.attr("ID");
270                             String frameName = frame.attr("name");
271 
272                             String lcFrameName = frameName.toLowerCase();
273 
274                             URI frameURI = uriForRoleset(lcFrameName);
275 
276                             URI classMappingURI = null;
277                             if (retroMappings != null) {
278                                 if (!added.contains(lcFrameName)) {
279                                     String toMap = changed.get(lcFrameName);
280                                     mapCount++;
281                                     URI oldFrameURI = uriForRoleset(toMap != null ? toMap : lcFrameName, retroMappings);
282                                     addMappings(frameURI, oldFrameURI, null, null);
283                                     //    if (toMap != null) {
284                                     //        classMappingURI = addSingleMapping(null, prefix, DEFAULT_PRED_SUFFIX, uriForRoleset(lcFrameName),
285                                     //                uriForRoleset(toMap, retroMappings));
286                                     //    } else {
287                                     //        classMappingURI = addSingleMapping(null, prefix, DEFAULT_PRED_SUFFIX, uriForRoleset(lcFrameName),
288                                     //                uriForRoleset(lcFrameName, retroMappings));
289                                     //    }
290                                 }
291                             }
292 
293                             Date date = format.parse(cDate);
294 
295                             Match definition = JOOX.$(element.getElementsByTagName("definition"));
296                             String defText = Jsoup.parse(definition.text()).text().trim();
297 
298                             Match elements = JOOX.$(element).children("semType");
299                             addSemTypes(elements, semTypesFreq, semTypesForFrame, frameURI, frameURI);
300 
301                             URI cbyURI = addCBy(cBy);
302 
303                             addStatementToSink(frameURI, RDF.TYPE, PMOFN.FRAME);
304                             addStatementToSink(frameURI, RDFS.LABEL, frameName, false);
305                             addStatementToSink(frameURI, DCTERMS.CREATOR, cbyURI);
306                             addStatementToSink(frameURI, DCTERMS.CREATED, date);
307                             addStatementToSink(frameURI, DCTERMS.IDENTIFIER, Integer.parseInt(identifier));
308                             addStatementToSink(frameURI, SKOS.DEFINITION, defText);
309 
310                             HashSet<String> FEs = new HashSet<>();
311                             final Match fes = JOOX.$(element.getElementsByTagName("FE"));
312                             for (Element fe : fes) {
313 
314                                 String feName = fe.getAttribute("name");
315                                 if (FEs.contains(feName)) {
316                                     continue;
317                                 }
318                                 String lcFeName = feName.toLowerCase();
319                                 if (retroMappings != null) {
320                                     String completeRole = lcFrameName + argumentSeparator + lcFeName;
321                                     if (!added.contains(completeRole)) {
322                                         String toMap = changed.get(completeRole);
323                                         mapRoleCount++;
324                                         URI argumentURI = uriForArgument(lcFrameName, lcFeName);
325                                         if (toMap != null) {
326                                             String[] parts = toMap.split("@");
327                                             URI oldFrameURI = uriForRoleset(parts[0], retroMappings);
328                                             URI oldArgumentURI = uriForArgument(parts[0], parts[1], retroMappings);
329                                             addMappings(frameURI, oldFrameURI, null, null, argumentURI, oldArgumentURI);
330                                             //    addSingleMapping(classMappingURI, prefix, DEFAULT_ARG_SUFFIX,
331                                             //            uriForArgument(lcFrameName, lcFeName),
332                                             //            uriForArgument(parts[0], parts[1], retroMappings));
333                                         } else {
334                                             String f = lcFrameName;
335                                             if (changed.containsKey(f)) {
336                                                 f = changed.get(f);
337                                             }
338                                             URI oldFrameURI = uriForRoleset(f, retroMappings);
339                                             URI oldArgumentURI = uriForArgument(f, lcFeName, retroMappings);
340                                             addMappings(frameURI, oldFrameURI, null, null, argumentURI, oldArgumentURI);
341                                             //    addSingleMapping(classMappingURI, prefix, DEFAULT_ARG_SUFFIX,
342                                             //            uriForArgument(lcFrameName, lcFeName),
343                                             //            uriForArgument(f, lcFeName, retroMappings));
344                                         }
345                                     }
346                                 }
347 
348                                 FEs.add(feName);
349                                 String coreType = fe.getAttribute("coreType");
350                                 String feCBy = fe.getAttribute("cBy");
351                                 String feCDate = fe.getAttribute("cDate");
352                                 String feIdentifier = fe.getAttribute("ID");
353                                 String abbrev = fe.getAttribute("abbrev");
354 
355                                 Date feDate = format.parse(feCDate);
356 
357                                 Match feDefinition = JOOX.$(fe.getElementsByTagName("definition"));
358                                 String feDefText = Jsoup.parse(feDefinition.text()).text().trim();
359 
360                                 URI argumentURI = uriForArgument(frameName.toLowerCase(), feName.toLowerCase());
361                                 addStatementToSink(argumentURI, RDF.TYPE, PMOFN.FRAME_ELEMENT);
362                                 switch (coreType) {
363                                 case "Core":
364                                     addStatementToSink(argumentURI, RDF.TYPE, PMOFN.CORE_FRAME_ELEMENT);
365                                     break;
366                                 case "Peripheral":
367                                     addStatementToSink(argumentURI, RDF.TYPE, PMOFN.PERIPHERAL_FRAME_ELEMENT);
368                                     break;
369                                 case "Extra-Thematic":
370                                     addStatementToSink(argumentURI, RDF.TYPE, PMOFN.EXTRA_THEMATIC_FRAME_ELEMENT);
371                                     break;
372                                 case "Core-Unexpressed":
373                                     addStatementToSink(argumentURI, RDF.TYPE,
374                                             PMOFN.CORE_UNEXPRESSED_FRAME_ELEMENT);
375                                     break;
376                                 }
377 
378                                 URI feCByURI = addCBy(feCBy);
379 
380                                 addStatementToSink(argumentURI, RDFS.LABEL, feName, false);
381                                 addStatementToSink(argumentURI, DCTERMS.CREATOR, feCByURI);
382                                 addStatementToSink(argumentURI, DCTERMS.CREATED, feDate);
383                                 addStatementToSink(argumentURI, DCTERMS.IDENTIFIER, Integer.parseInt(feIdentifier));
384                                 addStatementToSink(argumentURI, SKOS.DEFINITION, feDefText);
385                                 addStatementToSink(argumentURI, PMO.ABBREVIATION, abbrev, false);
386                                 addStatementToSink(frameURI, PMO.SEM_ROLE, argumentURI);
387 
388                                 Match subElems;
389 
390                                 subElems = JOOX.$(fe.getElementsByTagName("semType"));
391                                 addSemTypes(subElems, semTypesFreq, semTypesForFrame, argumentURI, frameURI);
392 
393                                 subElems = JOOX.$(fe.getElementsByTagName("excludesFE"));
394                                 for (Element subElem : subElems) {
395                                     String seName = subElem.getAttribute("name");
396                                     URI subElURI = uriForArgument(frameName.toLowerCase(), seName.toLowerCase());
397                                     addStatementToSink(argumentURI, PMOFN.EXCLUDES_FRAME_ELEMENT, subElURI);
398                                 }
399 
400                                 subElems = JOOX.$(fe.getElementsByTagName("requiresFE"));
401                                 for (Element subElem : subElems) {
402                                     String seName = subElem.getAttribute("name");
403                                     URI subElURI = uriForArgument(frameName.toLowerCase(), seName.toLowerCase());
404                                     addStatementToSink(argumentURI, PMOFN.REQUIRES_FRAME_ELEMENT, subElURI);
405                                 }
406                             }
407 
408                             final Match fecs = JOOX.$(element.getElementsByTagName("FEcoreSet"));
409                             int coreset = 0;
410                             for (Element members : fecs) {
411                                 coreset++;
412 
413                                 URI coresetURI = createURI(frameURI.toString() + "_coreSet" + coreset);
414                                 addStatementToSink(frameURI, PMOFN.FE_CORE_SET_P, coresetURI);
415                                 addStatementToSink(coresetURI, RDF.TYPE, PMOFN.FE_CORE_SET_C);
416 
417                                 Match memberFEs = JOOX.$(members.getElementsByTagName("memberFE"));
418                                 for (Element memberFE : memberFEs) {
419                                     String mName = memberFE.getAttribute("name");
420 
421                                     // todo: the URI of a coreset item is the same as the role?
422                                     URI itemURI = uriForArgument(frameName.toLowerCase(), mName.toLowerCase());
423                                     addStatementToSink(coresetURI, PMO.ITEM, itemURI);
424                                 }
425 
426                             }
427 
428                             final Match frameRelations = JOOX.$(element.getElementsByTagName("frameRelation"));
429                             for (Element frameRelation : frameRelations) {
430                                 Match relatedFrames = JOOX.$(frameRelation.getElementsByTagName("relatedFrame"));
431                                 String type = frameRelation.getAttribute("type");
432 
433                                 URI typeURI = null;
434                                 switch (type) {
435                                 case "Inherits from":
436                                     typeURI = PMOFN.INHERITS_FROM;
437                                     break;
438                                 case "Is Causative of":
439                                     typeURI = PMOFN.IS_CAUSATIVE_OF;
440                                     break;
441                                 case "Is Inchoative of":
442                                     typeURI = PMOFN.IS_INCHOATIVE_OF;
443                                     break;
444                                 case "Perspective on":
445                                     typeURI = PMOFN.PERSPECTIVE_ON;
446                                     break;
447                                 case "Precedes":
448                                     typeURI = PMOFN.PRECEDES;
449                                     break;
450                                 case "See also":
451                                     typeURI = PMOFN.SEE_ALSO;
452                                     break;
453                                 case "Subframe of":
454                                     typeURI = PMOFN.SUBFRAME_OF;
455                                     break;
456                                 case "Uses":
457                                     typeURI = PMOFN.USES;
458                                     break;
459                                 }
460 
461                                 if (typeURI == null) {
462                                     continue;
463                                 }
464 
465                                 for (Element relatedFrame : relatedFrames) {
466                                     String relatedFrameName = relatedFrame.getTextContent();
467 
468                                     if (bugMap.contains(relatedFrameName)) {
469                                         continue;
470                                     }
471                                     addStatementToSink(frameURI, typeURI,
472                                             uriForRoleset(relatedFrameName.toLowerCase()));
473                                     typesFreqs.add(typeURI);
474                                 }
475 
476                             }
477 
478                             final Match lexUnits = JOOX.$(element.getElementsByTagName("lexUnit"));
479                             for (Element lexUnit : lexUnits) {
480                                 luCount++;
481                                 String leCBy = lexUnit.getAttribute("cBy");
482                                 String leCDate = lexUnit.getAttribute("cDate");
483                                 String leIdentifier = lexUnit.getAttribute("ID");
484                                 String incorporatedFE = lexUnit.getAttribute("incorporatedFE");
485 
486                                 // todo: name non รจ mai usato?
487                                 String name = lexUnit.getAttribute("name");
488 
489                                 String status = lexUnit.getAttribute("status");
490                                 String pos = lexUnit.getAttribute("POS").toLowerCase();
491                                 String leDefinition = JOOX.$(lexUnit.getElementsByTagName("definition")).text();
492 
493                                 // Lemmas
494                                 StringBuilder builder = new StringBuilder();
495                                 Match lexemes = JOOX.$(lexUnit).children("lexeme");
496 
497                                 List<String> lexemeList = new ArrayList<>();
498                                 List<String> posList = new ArrayList<>();
499                                 for (Element lexeme : lexemes) {
500                                     String lemmaName = lexeme.getAttribute("name");
501                                     String lemmaPos = lexeme.getAttribute("POS");
502 
503                                     // todo: not used
504 //                                    String headWord = lexeme.getAttribute("headword");
505 //                                    if (headWord != null && headWord.equals("true")) {
506 //
507 //                                    }
508 
509                                     lexemeList.add(lemmaName);
510                                     posList.add(lemmaPos);
511                                 }
512 
513                                 String goodLemma = String.join(" ", lexemeList);
514                                 String uriLemma = String.join("+", lexemeList);
515                                 URI lexicalEntryURI = addLexicalEntry(goodLemma, uriLemma, lexemeList, posList, pos, getLexicon());
516                                 URI luURI = getLuURI(pos, uriLemma, frameName.toLowerCase());
517 
518 //                                String origLemma = null;
519 //                                for (Element lexeme : lexemes) {
520 //                                    String lemmaName = lexeme.getAttribute("name");
521 //                                    String headWord = lexeme.getAttribute("headword");
522 //                                    if (headWord != null && headWord.equals("true")) {
523 //                                        origLemma = lemmaName;
524 //                                    }
525 //                                    builder.append(lemmaName).append(" ");
526 //                                }
527 //                                String lemma = builder.toString().trim().replaceAll("\\s+", "_");
528 //                                if (origLemma == null) {
529 //                                    origLemma = lemma;
530 //                                }
531 
532 //                                URI lexicalEntryURI = addLexicalEntry(origLemma, lemma, pos, getLexicon());
533 //                                URI luURI = getLuURI(pos, lemma, frameName.toLowerCase());
534 
535                                 lus.put(leIdentifier, luURI);
536 
537                                 Match stElements = JOOX.$(lexUnit).children("semType");
538                                 addSemTypes(stElements, semTypesFreq, semTypesForFrame, luURI, frameURI);
539 
540                                 addStatementToSink(luURI, RDF.TYPE, PMOFN.LEXICAL_UNIT);
541                                 addStatementToSink(luURI, PMO.EVOKED_CONCEPT, frameURI);
542                                 addStatementToSink(luURI, PMO.EVOKING_ENTRY, lexicalEntryURI);
543                                 addStatementToSink(luURI, LEXINFO.PART_OF_SPEECH_P, getPosURI(pos));
544                                 addStatementToSink(luURI, DCTERMS.IDENTIFIER, Integer.parseInt(leIdentifier));
545                                 addStatementToSink(luURI, SKOS.DEFINITION, leDefinition);
546                                 addStatementToSink(luURI, RDFS.LABEL, name, false);
547 
548                                 URI statusURI = getStatusURI(status);
549                                 addStatementToSink(luURI, PMOFN.STATUS, statusURI);
550 
551                                 URI leCByURI = addCBy(leCBy);
552                                 Date leDate = format.parse(leCDate);
553 
554                                 addStatementToSink(luURI, DCTERMS.CREATOR, leCByURI);
555                                 addStatementToSink(luURI, DCTERMS.CREATED, leDate);
556                                 addStatementToSink(lexicalEntryURI, ONTOLEX.EVOKES, frameURI);
557 
558                                 //    for (String fe : FEs) {
559                                 //        URI argumentURI = uriForArgument(frameName.toLowerCase(), fe.toLowerCase());
560                                 //        URI conceptualizationURI = uriForConceptualization(uriLemma, pos,
561                                 //                frameName.toLowerCase(), fe.toLowerCase());
562                                 //        addStatementToSink(conceptualizationURI, RDF.TYPE, PMO.CONCEPTUALIZATION);
563                                 //        addStatementToSink(conceptualizationURI, PMO.EVOKED_CONCEPT, argumentURI);
564                                 //        addStatementToSink(conceptualizationURI, PMO.EVOKING_ENTRY, lexicalEntryURI);
565                                 //        addStatementToSink(lexicalEntryURI, ONTOLEX.EVOKES, argumentURI);
566                                 //    }
567 
568                                 if (incorporatedFE != null && incorporatedFE.trim().length() > 0) {
569                                     incorporatedFE = incorporatedFE.trim();
570                                     if (FEs.contains(incorporatedFE)) {
571                                         URI argumentURI = uriForArgument(frameName.toLowerCase(),
572                                                 incorporatedFE.toLowerCase());
573                                         addStatementToSink(luURI, PMOFN.INCORPORATED_FRAME_ELEMENT, argumentURI);
574                                     }
575                                 }
576                             }
577 
578                         }
579 
580                     } catch (final Exception ex) {
581                         throw new IOException(ex);
582                     }
583                 }
584             }
585 
586             LOGGER.info("Extracted {} lexical units", luCount);
587             LOGGER.info("Extracted {} class mappings", mapCount);
588             LOGGER.info("Extracted {} role mappings", mapRoleCount);
589 
590             int semTypesCount = 0;
591             for (URI uri : semTypesFreq.keySet()) {
592                 semTypesCount += semTypesFreq.get(uri);
593             }
594             LOGGER.info("Extracted {} semantic types", semTypesCount);
595 
596 //            for (URI uri : typesFreqs.keySet()) {
597 //                LOGGER.info("{} --> {}", uri, typesFreqs.get(uri));
598 //            }
599 //            for (URI uri : typesFreqsFER.keySet()) {
600 //                LOGGER.info("{} --> {}", uri, typesFreqsFER.get(uri));
601 //            }
602 //            for (URI uri : semTypesFreq.keySet()) {
603 //                LOGGER.info("{} --> {}", uri, semTypesFreq.get(uri));
604 //            }
605 
606 //            int total = 0;
607 //            Iterator<Map.Entry<URI, Integer>> iterator = semTypesFreq.getSorted().iterator();
608 //            while (iterator.hasNext()) {
609 //                URI uri = iterator.next().getKey();
610 //                LOGGER.info("{} --> {}", uri, semTypesFreq.get(uri));
611 //                total += semTypesFreq.get(uri);
612 //            }
613 //            LOGGER.info("Total: {}", total);
614 
615             if (extractExamples) {
616 
617                 int totalCount = 0;
618                 int skippedCount = 0;
619 
620                 LOGGER.info("Extracting examples");
621                 for (final File file : Files.fileTreeTraverser().preOrderTraversal(paths.get("lu"))) {
622                     if (!file.isDirectory() && file.getName().endsWith(".xml")) {
623                         LOGGER.debug("Processing {} ...", file);
624 
625                         try {
626                             document = dbf.newDocumentBuilder().parse(file);
627                             final Match lexUnits = JOOX.$(document.getElementsByTagName("lexUnit"));
628                             String frameName = lexUnits.attr("frame");
629                             String luID = lexUnits.attr("ID");
630 
631                             // todo: check this
632                             if (lus.get(luID) == null) {
633                                 LOGGER.error("LU {} is not present in Map", luID);
634                                 continue;
635                             }
636 
637                             URI frameURI = uriForRoleset(frameName.toLowerCase());
638                             URI luURI = lus.get(luID);
639 
640                             final Match examples = JOOX.$(document.getElementsByTagName("sentence"));
641 
642                             for (Element example : examples) {
643 
644                                 synchronized (this) {
645 
646                                     // Create temporary sink
                                    // TODO: verify that no continue/break can leave this block between
                                    // setSink(tempSink) and setDefaultSinkAsSink(), or the temporary sink stays set
648 
649                                     Collection<Statement> tempStatements = new ArrayList<>();
650                                     RDFHandler tempSink = RDFHandlers.wrap(tempStatements);
651                                     boolean keep = true;
652                                     setSink(tempSink);
653 
654                                     // Load example
655 
656                                     boolean hasTarget = false;
657                                     URI asURI = null;
658                                     totalCount++;
659 
660                                     String id = example.getAttribute("ID");
661                                     URI exampleURI = uriForExample(id);
662 
663                                     String text = JOOX.$(example.getElementsByTagName("text")).text();
664 
665                                     Match layers = JOOX.$(example.getElementsByTagName("layer"));
666 
667                                     Set<Integer> starts = new HashSet<>();
668                                     Set<Integer> ends = new HashSet<>();
669 
670                                     Matcher matcher = TOKEN_REGEX.matcher(text);
671 
672                                     while (matcher.find()) {
673                                         starts.add(matcher.start());
674                                         ends.add(matcher.end() - 1);
675                                     }
676 
677                                     if (starts.size() == 0 || ends.size() == 0) {
678                                         LOGGER.error("A set is empty");
679                                         keep = false;
680                                     }
681 
682                                     // Loop for target
683                                     for (Element layer : layers) {
684                                         String layerName = layer.getAttribute("name");
685 
686                                         if (layerName == null) {
687                                             continue;
688                                         }
689 
690                                         if (layerName.equals("Target")) {
691                                             Match labels = JOOX.$(layer.getElementsByTagName("label"));
692 
693                                             Integer targetStart = null;
694                                             Integer targetEnd = null;
695 
696                                             for (Element label : labels) {
697 
698                                                 Integer start = null;
699                                                 Integer end = null;
700                                                 try {
701                                                     start = Integer.parseInt(label.getAttribute("start"));
702                                                     end = Integer.parseInt(label.getAttribute("end"));
703                                                 } catch (Exception e) {
704                                                     // ignored
705                                                 }
706 
707                                                 if (start != null && !starts.contains(start)) {
708                                                     LOGGER.debug("Error in start index, skipping ({} - {})", luID,
709                                                             text);
710                                                     continue;
711                                                 }
712                                                 if (end != null && !ends.contains(end)) {
713                                                     LOGGER.debug("Error in end index, skipping ({} - {})", luID, text);
714                                                     continue;
715                                                 }
716 
717                                                 if (start != null) {
718                                                     if (targetStart == null || targetStart > start) {
719                                                         targetStart = start;
720                                                     }
721                                                 }
722                                                 if (end != null) {
723                                                     if (targetEnd == null || targetEnd < end) {
724                                                         targetEnd = end;
725                                                     }
726                                                 }
727                                             }
728 
729                                             if (targetStart == null) {
730                                                 LOGGER.debug("Target start is null");
731                                                 continue;
732                                             }
733                                             if (targetEnd == null) {
734                                                 LOGGER.debug("Target end is null");
735                                                 continue;
736                                             }
737 
738                                             hasTarget = true;
739                                             asURI = createURI(exampleURI + "_annSet_" + targetStart);
740                                             addStatementToSink(asURI, RDF.TYPE, PMO.ANNOTATION_SET, EXAMPLE_GRAPH);
741 
742                                             URI markableURI = uriForMarkable(exampleURI, targetStart, targetEnd);
743                                             String anchor = text.substring(targetStart, targetEnd);
744 
745                                             URI aURI = createURI(asURI + "_pred");
746                                             addStatementToSink(aURI, RDF.TYPE, NIF.ANNOTATION_C, EXAMPLE_GRAPH);
747                                             addStatementToSink(asURI, PMO.ITEM, aURI, EXAMPLE_GRAPH);
748                                             addStatementToSink(aURI, PMO.VALUE_OBJ, frameURI, EXAMPLE_GRAPH);
749                                             addStatementToSink(aURI, PMO.VALUE_OBJ, lus.get(luID), EXAMPLE_GRAPH);
750                                             addStatementToSink(exampleURI, NIF.ANNOTATION_P, aURI, EXAMPLE_GRAPH);
751 
752                                             addStatementToSink(markableURI, RDF.TYPE, PMO.MARKABLE,
753                                                     EXAMPLE_GRAPH);
754                                             addStatementToSink(markableURI, NIF.ANCHOR_OF, anchor, EXAMPLE_GRAPH);
755 //                                            addStatementToSink(markableURI, NIF.ANNOTATION_P, frameURI);
756 //                                            addStatementToSink(markableURI, NIF.ANNOTATION_P, luURI);
757                                             addStatementToSink(markableURI, NIF.ANNOTATION_P, aURI, EXAMPLE_GRAPH);
758                                             addStatementToSink(markableURI, NIF.BEGIN_INDEX, targetStart,
759                                                     EXAMPLE_GRAPH);
760                                             addStatementToSink(markableURI, NIF.END_INDEX, targetEnd, EXAMPLE_GRAPH);
761                                             addStatementToSink(markableURI, NIF.REFERENCE_CONTEXT, exampleURI,
762                                                     EXAMPLE_GRAPH);
763                                         }
764                                     }
765 
766                                     if (!hasTarget) {
767                                         LOGGER.debug("Skipped example: {} in {}", id, luID);
768                                         keep = false;
769                                     }
770 
771                                     addStatementToSink(exampleURI, RDF.TYPE, PMO.EXAMPLE, EXAMPLE_GRAPH);
772                                     addStatementToSink(exampleURI, NIF.IS_STRING, text, EXAMPLE_GRAPH);
773 //                                    addStatementToSink(frameURI, PMO.EXAMPLE_P, exampleURI);
774 //                                    addStatementToSink(luURI, PMO.EXAMPLE_P, exampleURI);
775 
776                                     // Loop for FE
777                                     int i = 0;
778                                     if (hasTarget) {
779                                         for (Element layer : layers) {
780                                             String layerName = layer.getAttribute("name");
781 
782                                             if (layerName == null) {
783                                                 continue;
784                                             }
785 
786                                             if (layerName.equals("FE")) {
787                                                 Match labels = JOOX.$(layer.getElementsByTagName("label"));
788                                                 for (Element label : labels) {
789                                                     String roleName = label.getAttribute("name");
790                                                     URI argumentURI = uriForArgument(frameName.toLowerCase(),
791                                                             roleName.toLowerCase());
792 
793                                                     String anchor = null;
794 
795                                                     Integer start = null;
796                                                     Integer end = null;
797                                                     try {
798                                                         start = Integer.parseInt(label.getAttribute("start"));
799                                                         end = Integer.parseInt(label.getAttribute("end"));
800 
801                                                         if (start + end > 0) {
802                                                             anchor = text.substring(start, end);
803                                                         }
804                                                     } catch (Exception e) {
805                                                         // ignored
806                                                     }
807 
808                                                     if (start != null && !starts.contains(start)) {
809                                                         LOGGER.debug("Error in start index, skipping ({} - {})", luID,
810                                                                 text);
811                                                         keep = false;
812                                                         continue;
813                                                     }
814                                                     if (end != null && !ends.contains(end)) {
815                                                         LOGGER.debug("Error in end index, skipping ({} - {})", luID,
816                                                                 text);
817                                                         keep = false;
818                                                         continue;
819                                                     }
820 
821                                                     i++;
822 
823                                                     URI aURI = createURI(asURI + "_arg" + i);
824                                                     addStatementToSink(asURI, PMO.ITEM, aURI, EXAMPLE_GRAPH);
825                                                     addStatementToSink(aURI, RDF.TYPE, NIF.ANNOTATION_C, EXAMPLE_GRAPH);
826                                                     addStatementToSink(aURI, PMO.VALUE_OBJ, argumentURI, EXAMPLE_GRAPH);
827                                                     addStatementToSink(exampleURI, NIF.ANNOTATION_P, aURI,
828                                                             EXAMPLE_GRAPH);
829 
830                                                     if (anchor == null) {
831                                                         // addStatementToSink(aURI, RDF.TYPE, PMO.IMPLICIT_ANNOTATION, EXAMPLE_GRAPH);
832                                                     } else {
833 
834                                                         URI markableURI = uriForMarkable(exampleURI, start, end);
835 
836                                                         addStatementToSink(markableURI, RDF.TYPE, PMO.MARKABLE,
837                                                                 EXAMPLE_GRAPH);
838                                                         addStatementToSink(markableURI, NIF.ANCHOR_OF, anchor,
839                                                                 EXAMPLE_GRAPH);
840                                                         addStatementToSink(markableURI, NIF.ANNOTATION_P, aURI,
841                                                                 EXAMPLE_GRAPH);
842                                                         addStatementToSink(markableURI, NIF.BEGIN_INDEX, start,
843                                                                 EXAMPLE_GRAPH);
844                                                         addStatementToSink(markableURI, NIF.END_INDEX, end,
845                                                                 EXAMPLE_GRAPH);
846                                                         addStatementToSink(markableURI, NIF.REFERENCE_CONTEXT,
847                                                                 exampleURI, EXAMPLE_GRAPH);
848                                                     }
849                                                 }
850                                             }
851                                         }
852                                     }
853 
854                                     setDefaultSinkAsSink();
855 
856                                     if (!keep) {
857                                         skippedCount++;
858                                         continue;
859                                     }
860 
861                                     for (Statement statement : tempStatements) {
862                                         addStatementToSink(statement);
863                                     }
864                                 }
865                             }
866 
867                             // As a security measure
868                             setDefaultSinkAsSink();
869 
870                         } catch (final Exception ex) {
871                             throw new IOException(ex);
872                         }
873                     }
874                 }
875 
876                 LOGGER.info("Extracted examples: {}/{}", totalCount - skippedCount, totalCount);
877             }
878 
879         } catch (final Exception ex) {
880             throw new IOException(ex);
881         }
882     }
883 
884     private void addSemTypes(Match stElements, FrequencyHashSet<URI> semTypesFreq,
885             FrequencyHashSet<URI> semTypesForFrame, URI baseURI, URI frameURI) {
886         for (Element stElement : stElements) {
887             String LUSemType = stElement.getAttribute("name");
888             URI LUSemTypeURI = null;
889             if (LUSemType != null) {
890                 LUSemTypeURI = getSemTypeURI(LUSemType);
891             }
892             if (LUSemTypeURI != null) {
893                 addStatementToSink(baseURI, PMOFN.SEM_TYPE_P, LUSemTypeURI);
894                 semTypesFreq.add(LUSemTypeURI);
895                 semTypesForFrame.add(frameURI);
896             }
897         }
898     }
899 
900     private URI addCBy(String cBy) {
901         URI cbyURI = uriForCBy(cBy);
902         addStatementToSink(cbyURI, DCTERMS.IDENTIFIER, cBy, false);
903         return cbyURI;
904     }
905 
906     private URI uriForCBy(String cBy) {
907         StringBuilder builder = new StringBuilder();
908         builder.append(NAMESPACE);
909         builder.append(cBy.toLowerCase());
910         builder.append("_Creator");
911         return createURI(builder.toString());
912     }
913 
914     private URI uriForExample(String exampleID) {
915         StringBuilder builder = new StringBuilder();
916         builder.append(NAMESPACE);
917         builder.append(prefix).append(separator);
918         builder.append("example_");
919         builder.append(exampleID);
920         return createURI(builder.toString());
921     }
922 
923     private void addSemTypeToSink(Element semType) {
924         String name = semType.getAttribute("name");
925         String abbrev = semType.getAttribute("abbrev");
926         String id = semType.getAttribute("ID");
927 
928         String definition = JOOX.$(semType.getElementsByTagName("definition")).text();
929         String supID = JOOX.$(semType.getElementsByTagName("superType")).attr("supID");
930         String superTypeName = JOOX.$(semType.getElementsByTagName("superType")).attr("superTypeName");
931 
932         URI semTypeURI = getSemTypeURI(name);
933 
934         addStatementToSink(semTypeURI, RDF.TYPE, PMOFN.SEM_TYPE_C);
935         addStatementToSink(semTypeURI, DCTERMS.IDENTIFIER, Integer.parseInt(id));
936         addStatementToSink(semTypeURI, RDFS.LABEL, name, false);
937         addStatementToSink(semTypeURI, SKOS.DEFINITION, definition);
938         addStatementToSink(semTypeURI, PMO.ABBREVIATION, abbrev, false);
939 
940         if (supID != null) {
941             URI superSemTypeURI = getSemTypeURI(superTypeName);
942             addStatementToSink(semTypeURI, PMOFN.SUB_TYPE_OF, superSemTypeURI);
943         }
944     }
945 
946     private URI getSemTypeURI(String name) {
947         return getSemTypeURI(name, null);
948     }
949 
950     private URI getSemTypeURI(String name, @Nullable String prefix) {
951         if (prefix == null) {
952             prefix = this.prefix;
953         }
954 
955         name = name.toLowerCase();
956         name = name.replaceAll("-", "_");
957         name = name.replaceAll("\\s+", "_");
958 
959         StringBuilder builder = new StringBuilder();
960         builder.append(NAMESPACE);
961         builder.append(prefix);
962         builder.append(separator);
963         builder.append(name);
964         builder.append("_semType");
965         return createURI(builder.toString());
966     }
967 
968     private void addStatusToSink(Element statusType) {
969         String name = statusType.getAttribute("name");
970         String description = statusType.getAttribute("description");
971 
972         URI statusURI = getStatusURI(name);
973         addStatementToSink(statusURI, RDF.TYPE, PMOFN.LUSTATUS);
974         addStatementToSink(statusURI, RDFS.LABEL, name, false);
975         addStatementToSink(statusURI, SKOS.DEFINITION, description);
976     }
977 
978     private URI getLuURI(String pos, String luName, String frameName) {
979         return getLuURI(pos, luName, frameName, null);
980     }
981 
982     private URI getLuURI(String pos, String luName, String frameName, @Nullable String prefix) {
983         if (prefix == null) {
984             prefix = this.prefix;
985         }
986         StringBuilder builder = new StringBuilder();
987         builder.append(NAMESPACE);
988         builder.append(CONCEPTUALIZATION_PREFIX);
989         builder.append(separator);
990         
991         
992         
993         // builder.append(pos); // FIXME should normalize pos as below!!
994         builder.append(LEXINFO.map.get(getPosURI(pos)));
995         
996         builder.append(separator);
997         builder.append(luName.replaceAll("[^a-zA-Z0-9-_+]", ""));
998         builder.append(separator);
999         builder.append(rolesetPart(frameName, prefix));
1000         return createURI(builder.toString());
1001     }
1002 
1003     private URI getStatusURI(String name) {
1004         return getStatusURI(name, null);
1005     }
1006 
1007     private URI getStatusURI(String name, @Nullable String prefix) {
1008         if (prefix == null) {
1009             prefix = this.prefix;
1010         }
1011         StringBuilder builder = new StringBuilder();
1012         builder.append(NAMESPACE);
1013         builder.append(prefix);
1014         builder.append(separator);
1015         builder.append(name.toLowerCase());
1016         builder.append("_LUStatus");
1017         return createURI(builder.toString());
1018     }
1019 
1020     protected URI getPosURI(String pos) {
1021 
1022         pos = pos.toUpperCase();
1023 
1024         switch (pos) {
1025         case "A":
1026             return LEXINFO.ADJECTIVE;
1027         case "ADV":
1028             return LEXINFO.ADVERB;
1029         case "ART":
1030             return LEXINFO.DETERMINER;
1031         case "C":
1032             return LEXINFO.CONJUNCTION;
1033         case "INTJ":
1034             return LEXINFO.INTERJECTION;
1035         case "N":
1036             return LEXINFO.NOUN;
1037         case "NUM":
1038             return LEXINFO.CARDINAL_NUMERAL;
1039         case "PREP":
1040             return LEXINFO.PREPOSITION;
1041         case "PRON":
1042             return LEXINFO.PRONOUN;
1043         case "SCON":
1044             return LEXINFO.SUBORDINATING_CONJUNCTION;
1045         case "V":
1046             return LEXINFO.VERB;
1047         case "IDIO":
1048             return PMO.IDIOSYNCRATIC;
1049         case "AVP":
1050             return LEXINFO.PARTICLE;
1051         }
1052 
1053         return null;
1054     }
1055 
1056     @Override public String getArgLabel() {
1057         return "";
1058     }
1059 
1060     @Override protected String formatArg(String arg) {
1061         return super.formatArg(arg).toLowerCase();
1062     }
1063 }