1 package eu.fbk.dkm.premon.premonitor;
2
3 import com.google.common.io.Files;
4 import eu.fbk.dkm.premon.vocab.*;
5 import eu.fbk.dkm.utils.FrequencyHashSet;
6 import eu.fbk.rdfpro.RDFHandlers;
7 import org.joox.JOOX;
8 import org.joox.Match;
9 import org.jsoup.Jsoup;
10 import org.openrdf.model.Statement;
11 import org.openrdf.model.URI;
12 import org.openrdf.model.vocabulary.DCTERMS;
13 import org.openrdf.model.vocabulary.RDF;
14 import org.openrdf.model.vocabulary.RDFS;
15 import org.openrdf.model.vocabulary.SKOS;
16 import org.openrdf.rio.RDFHandler;
17 import org.slf4j.Logger;
18 import org.slf4j.LoggerFactory;
19 import org.w3c.dom.Document;
20 import org.w3c.dom.Element;
21
22 import javax.annotation.Nullable;
23 import javax.xml.parsers.DocumentBuilderFactory;
24 import java.io.File;
25 import java.io.IOException;
26 import java.text.DateFormat;
27 import java.text.SimpleDateFormat;
28 import java.util.*;
29 import java.util.regex.Matcher;
30 import java.util.regex.Pattern;
31
32 public class FramenetConverter extends Converter {
33
private static final Logger LOGGER = LoggerFactory.getLogger(FramenetConverter.class);

/** Input locations, keyed by a short logical name ("frame", "lu", "luIndex", ...). */
HashMap<String, File> paths = new HashMap<>();

/** Value of the "retromappings" property; when {@code null} no mappings to an older release are produced. */
private String retroMappings = null;

/**
 * Parser for the {@code cDate} attributes of the FrameNet XML files.
 * Note: {@link SimpleDateFormat} instances are not synchronized.
 */
private static final DateFormat format = new SimpleDateFormat("MM/dd/yyyy HH:mm:ss z E", Locale.ENGLISH);

/** Matches one whitespace-delimited token; used to compute valid token start/end offsets of an example. */
private static final Pattern TOKEN_REGEX = Pattern.compile("[^\\s]+");

/** When non-null, only the frame file with exactly this file name is converted. */
private static final String ONE_FRAME = null;

/**
 * Names of related frames that are skipped when emitting per-frame relations.
 * Built once and read-only, so that creating converters does not mutate shared static state.
 */
private static final Set<String> bugMap =
        Collections.unmodifiableSet(new HashSet<>(Arrays.asList("Test35", "Test_the_test")));

/** Value of the "thisversion" property (defaults to "1.5"); selects the element names used in the diff file. */
private String thisVersion = null;

/**
 * Creates a converter for the FrameNet distribution located under {@code path}.
 *
 * @param path       root folder of the FrameNet data
 * @param sink       handler receiving the generated statements
 * @param properties configuration; the keys "source", "language", "retromapfile",
 *                   "retromappings" and "thisversion" are read here
 * @param wnInfo     passed through unchanged to the superclass constructor
 */
public FramenetConverter(File path, RDFHandler sink, Properties properties, Map<String, URI> wnInfo) {
    super(path, properties.getProperty("source"), sink, properties, properties.getProperty("language"), wnInfo);

    // All inputs live directly under the (absolute) root folder.
    final String base = this.path.getAbsolutePath() + File.separator;

    paths.put("frame", new File(base + "frame"));
    paths.put("lu", new File(base + "lu"));
    paths.put("luIndex", new File(base + "luIndex.xml"));
    paths.put("semTypes", new File(base + "semTypes.xml"));
    paths.put("frRelation", new File(base + "frRelation.xml"));
    paths.put("retroMappings", new File(base + properties.getProperty("retromapfile")));

    retroMappings = properties.getProperty("retromappings");
    thisVersion = properties.getProperty("thisversion", "1.5");

    LOGGER.info("Starting dataset: {}", prefix);
}
70
71 @Override public void convert() throws IOException {
72
73 addMetaToSink();
74
75 final DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
76
77 for (String key : paths.keySet()) {
78 File value = paths.get(key);
79 if (!value.exists()) {
80 LOGGER.error("Path {} does not exist", value.getAbsolutePath());
81 }
82 }
83
84 try {
85
86 String tagPrevious = "r1-5";
87 String tagCurrent = "r1-6";
88 String fnPrevious = "r1-5_FrameName";
89
90 if (thisVersion.equals("1.7")) {
91 tagPrevious = "r1.6";
92 tagCurrent = "r1.7";
93 fnPrevious = "r1.6_FrameName";
94 }
95
96 Document document;
97 HashSet<String> added = new HashSet<>();
98 HashMap<String, String> changed = new HashMap<>();
99
100
101 if (retroMappings != null) {
102 LOGGER.info("Extracting diff file");
103 document = dbf.newDocumentBuilder().parse(paths.get("retroMappings"));
104
105 Match diffs;
106
107 diffs = JOOX.$(document).xpath("FrameDiff/Added/Frame");
108 for (Element diff : diffs) {
109 added.add(diff.getTextContent().toLowerCase());
110 }
111 diffs = JOOX.$(document).xpath("FrameDiff/Changed/Frame");
112 for (Element diff : diffs) {
113 String r15 = JOOX.$(diff.getElementsByTagName(tagPrevious)).text().toLowerCase();
114 String r16 = JOOX.$(diff.getElementsByTagName(tagCurrent)).text().toLowerCase();
115 changed.put(r16, r15);
116 }
117
118 diffs = JOOX.$(document).xpath("FrameElementDiff/Added/FrameElement");
119 for (Element diff : diffs) {
120 added.add(diff.getAttribute("FrameName").toLowerCase() + argumentSeparator + diff.getTextContent()
121 .toLowerCase());
122 }
123 diffs = JOOX.$(document).xpath("FrameElementDiff/Changed/FrameElement");
124 for (Element diff : diffs) {
125 String f16 = diff.getAttribute("FrameName").toLowerCase();
126 String f15 = diff.getAttribute(fnPrevious);
127 if (f15 == null || f15.length() == 0) {
128 f15 = f16;
129 }
130 f15 = f15.toLowerCase();
131 String r15 = JOOX.$(diff.getElementsByTagName(tagPrevious)).text().toLowerCase();
132 String r16 = JOOX.$(diff.getElementsByTagName(tagCurrent)).text().toLowerCase();
133 changed.put(f16 + argumentSeparator + r16, f15 + argumentSeparator + r15);
134 }
135
136 }
137
138
139 LOGGER.info("Extracting luIndex");
140 document = dbf.newDocumentBuilder().parse(paths.get("luIndex"));
141 Match statusTypes = JOOX.$(document.getElementsByTagName("statusType"));
142 for (Element statusType : statusTypes) {
143 addStatusToSink(statusType);
144 }
145
146
147 LOGGER.info("Extracting semTypes");
148 document = dbf.newDocumentBuilder().parse(paths.get("semTypes"));
149 Match semTypes = JOOX.$(document.getElementsByTagName("semType"));
150 for (Element semType : semTypes) {
151 addSemTypeToSink(semType);
152 }
153
154
155 FrequencyHashSet<URI> typesFreqs = new FrequencyHashSet<>();
156 FrequencyHashSet<URI> typesFreqsFER = new FrequencyHashSet<>();
157
158 LOGGER.info("Extracting frRelations");
159 document = dbf.newDocumentBuilder().parse(paths.get("frRelation"));
160 Match frRelationTypes = JOOX.$(document.getElementsByTagName("frameRelationType"));
161 for (Element frRelationType : frRelationTypes) {
162 String name = frRelationType.getAttribute("name");
163 URI typeURI = null;
164
165 boolean invert = false;
166
167 switch (name) {
168 case "Inheritance":
169 typeURI = PMOFN.INHERITS_FROM;
170 break;
171 case "Subframe":
172 typeURI = PMOFN.SUBFRAME_OF;
173 break;
174 case "Using":
175 typeURI = PMOFN.USES;
176 break;
177 case "See_also":
178 typeURI = PMOFN.SEE_ALSO;
179 invert = true;
180 break;
181 case "Inchoative_of":
182 typeURI = PMOFN.IS_INCHOATIVE_OF;
183 invert = true;
184 break;
185 case "Causative_of":
186 typeURI = PMOFN.IS_CAUSATIVE_OF;
187 invert = true;
188 break;
189 case "Precedes":
190 typeURI = PMOFN.PRECEDES;
191 invert = true;
192 break;
193 case "Perspective_on":
194 typeURI = PMOFN.PERSPECTIVE_ON;
195 break;
196 case "ReFraming_Mapping":
197 typeURI = PMOFN.REFRAME_MAPPING;
198 break;
199 case "Metaphor":
200 typeURI = PMOFN.METAPHOR;
201 break;
202 }
203
204 if (typeURI == null) {
205 LOGGER.error("typeURI is null ({})", name);
206 continue;
207 }
208
209 Match frameRelations = JOOX.$(frRelationType.getElementsByTagName("frameRelation"));
210 for (Element frameRelation : frameRelations) {
211 String subFrameName = frameRelation.getAttribute("subFrameName");
212 String superFrameName = frameRelation.getAttribute("superFrameName");
213
214 if (invert) {
215 addStatementToSink(uriForRoleset(superFrameName.toLowerCase()), typeURI,
216 uriForRoleset(subFrameName.toLowerCase()));
217 } else {
218 addStatementToSink(uriForRoleset(subFrameName.toLowerCase()), typeURI,
219 uriForRoleset(superFrameName.toLowerCase()));
220 }
221 typesFreqs.add(typeURI);
222
223 Match feRelations = JOOX.$(frameRelation.getElementsByTagName("FERelation"));
224 for (Element feRelation : feRelations) {
225
226
227 URI relURI = createURI(typeURI.toString() + "FER");
228
229 URI arg1 = uriForArgument(subFrameName.toLowerCase(), feRelation.getAttribute("subFEName"));
230 URI arg2 = uriForArgument(superFrameName.toLowerCase(), feRelation.getAttribute("superFEName"));
231
232 if (invert) {
233 addStatementToSink(arg2, relURI, arg1);
234 } else {
235 addStatementToSink(arg1, relURI, arg2);
236 }
237 typesFreqsFER.add(relURI);
238 }
239
240 }
241
242 }
243
244
245 LOGGER.info("Extracting frames");
246 int luCount = 0;
247 int mapCount = 0;
248 int mapRoleCount = 0;
249 FrequencyHashSet<URI> semTypesFreq = new FrequencyHashSet<>();
250 FrequencyHashSet<URI> semTypesForFrame = new FrequencyHashSet<>();
251 HashMap<String, URI> lus = new HashMap<>();
252 for (final File file : Files.fileTreeTraverser().preOrderTraversal(paths.get("frame"))) {
253 if (!file.isDirectory() && file.getName().endsWith(".xml")) {
254 LOGGER.debug("Processing {} ...", file);
255
256 if (ONE_FRAME != null) {
257 if (!file.getName().equals(ONE_FRAME)) {
258 continue;
259 }
260 }
261
262 try {
263 document = dbf.newDocumentBuilder().parse(file);
264 final Match frame = JOOX.$(document.getElementsByTagName("frame"));
265
266 for (Element element : frame) {
267 String cBy = frame.attr("cBy");
268 String cDate = frame.attr("cDate");
269 String identifier = frame.attr("ID");
270 String frameName = frame.attr("name");
271
272 String lcFrameName = frameName.toLowerCase();
273
274 URI frameURI = uriForRoleset(lcFrameName);
275
276 URI classMappingURI = null;
277 if (retroMappings != null) {
278 if (!added.contains(lcFrameName)) {
279 String toMap = changed.get(lcFrameName);
280 mapCount++;
281 URI oldFrameURI = uriForRoleset(toMap != null ? toMap : lcFrameName, retroMappings);
282 addMappings(frameURI, oldFrameURI, null, null);
283
284
285
286
287
288
289
290 }
291 }
292
293 Date date = format.parse(cDate);
294
295 Match definition = JOOX.$(element.getElementsByTagName("definition"));
296 String defText = Jsoup.parse(definition.text()).text().trim();
297
298 Match elements = JOOX.$(element).children("semType");
299 addSemTypes(elements, semTypesFreq, semTypesForFrame, frameURI, frameURI);
300
301 URI cbyURI = addCBy(cBy);
302
303 addStatementToSink(frameURI, RDF.TYPE, PMOFN.FRAME);
304 addStatementToSink(frameURI, RDFS.LABEL, frameName, false);
305 addStatementToSink(frameURI, DCTERMS.CREATOR, cbyURI);
306 addStatementToSink(frameURI, DCTERMS.CREATED, date);
307 addStatementToSink(frameURI, DCTERMS.IDENTIFIER, Integer.parseInt(identifier));
308 addStatementToSink(frameURI, SKOS.DEFINITION, defText);
309
310 HashSet<String> FEs = new HashSet<>();
311 final Match fes = JOOX.$(element.getElementsByTagName("FE"));
312 for (Element fe : fes) {
313
314 String feName = fe.getAttribute("name");
315 if (FEs.contains(feName)) {
316 continue;
317 }
318 String lcFeName = feName.toLowerCase();
319 if (retroMappings != null) {
320 String completeRole = lcFrameName + argumentSeparator + lcFeName;
321 if (!added.contains(completeRole)) {
322 String toMap = changed.get(completeRole);
323 mapRoleCount++;
324 URI argumentURI = uriForArgument(lcFrameName, lcFeName);
325 if (toMap != null) {
326 String[] parts = toMap.split("@");
327 URI oldFrameURI = uriForRoleset(parts[0], retroMappings);
328 URI oldArgumentURI = uriForArgument(parts[0], parts[1], retroMappings);
329 addMappings(frameURI, oldFrameURI, null, null, argumentURI, oldArgumentURI);
330
331
332
333 } else {
334 String f = lcFrameName;
335 if (changed.containsKey(f)) {
336 f = changed.get(f);
337 }
338 URI oldFrameURI = uriForRoleset(f, retroMappings);
339 URI oldArgumentURI = uriForArgument(f, lcFeName, retroMappings);
340 addMappings(frameURI, oldFrameURI, null, null, argumentURI, oldArgumentURI);
341
342
343
344 }
345 }
346 }
347
348 FEs.add(feName);
349 String coreType = fe.getAttribute("coreType");
350 String feCBy = fe.getAttribute("cBy");
351 String feCDate = fe.getAttribute("cDate");
352 String feIdentifier = fe.getAttribute("ID");
353 String abbrev = fe.getAttribute("abbrev");
354
355 Date feDate = format.parse(feCDate);
356
357 Match feDefinition = JOOX.$(fe.getElementsByTagName("definition"));
358 String feDefText = Jsoup.parse(feDefinition.text()).text().trim();
359
360 URI argumentURI = uriForArgument(frameName.toLowerCase(), feName.toLowerCase());
361 addStatementToSink(argumentURI, RDF.TYPE, PMOFN.FRAME_ELEMENT);
362 switch (coreType) {
363 case "Core":
364 addStatementToSink(argumentURI, RDF.TYPE, PMOFN.CORE_FRAME_ELEMENT);
365 break;
366 case "Peripheral":
367 addStatementToSink(argumentURI, RDF.TYPE, PMOFN.PERIPHERAL_FRAME_ELEMENT);
368 break;
369 case "Extra-Thematic":
370 addStatementToSink(argumentURI, RDF.TYPE, PMOFN.EXTRA_THEMATIC_FRAME_ELEMENT);
371 break;
372 case "Core-Unexpressed":
373 addStatementToSink(argumentURI, RDF.TYPE,
374 PMOFN.CORE_UNEXPRESSED_FRAME_ELEMENT);
375 break;
376 }
377
378 URI feCByURI = addCBy(feCBy);
379
380 addStatementToSink(argumentURI, RDFS.LABEL, feName, false);
381 addStatementToSink(argumentURI, DCTERMS.CREATOR, feCByURI);
382 addStatementToSink(argumentURI, DCTERMS.CREATED, feDate);
383 addStatementToSink(argumentURI, DCTERMS.IDENTIFIER, Integer.parseInt(feIdentifier));
384 addStatementToSink(argumentURI, SKOS.DEFINITION, feDefText);
385 addStatementToSink(argumentURI, PMO.ABBREVIATION, abbrev, false);
386 addStatementToSink(frameURI, PMO.SEM_ROLE, argumentURI);
387
388 Match subElems;
389
390 subElems = JOOX.$(fe.getElementsByTagName("semType"));
391 addSemTypes(subElems, semTypesFreq, semTypesForFrame, argumentURI, frameURI);
392
393 subElems = JOOX.$(fe.getElementsByTagName("excludesFE"));
394 for (Element subElem : subElems) {
395 String seName = subElem.getAttribute("name");
396 URI subElURI = uriForArgument(frameName.toLowerCase(), seName.toLowerCase());
397 addStatementToSink(argumentURI, PMOFN.EXCLUDES_FRAME_ELEMENT, subElURI);
398 }
399
400 subElems = JOOX.$(fe.getElementsByTagName("requiresFE"));
401 for (Element subElem : subElems) {
402 String seName = subElem.getAttribute("name");
403 URI subElURI = uriForArgument(frameName.toLowerCase(), seName.toLowerCase());
404 addStatementToSink(argumentURI, PMOFN.REQUIRES_FRAME_ELEMENT, subElURI);
405 }
406 }
407
408 final Match fecs = JOOX.$(element.getElementsByTagName("FEcoreSet"));
409 int coreset = 0;
410 for (Element members : fecs) {
411 coreset++;
412
413 URI coresetURI = createURI(frameURI.toString() + "_coreSet" + coreset);
414 addStatementToSink(frameURI, PMOFN.FE_CORE_SET_P, coresetURI);
415 addStatementToSink(coresetURI, RDF.TYPE, PMOFN.FE_CORE_SET_C);
416
417 Match memberFEs = JOOX.$(members.getElementsByTagName("memberFE"));
418 for (Element memberFE : memberFEs) {
419 String mName = memberFE.getAttribute("name");
420
421
422 URI itemURI = uriForArgument(frameName.toLowerCase(), mName.toLowerCase());
423 addStatementToSink(coresetURI, PMO.ITEM, itemURI);
424 }
425
426 }
427
428 final Match frameRelations = JOOX.$(element.getElementsByTagName("frameRelation"));
429 for (Element frameRelation : frameRelations) {
430 Match relatedFrames = JOOX.$(frameRelation.getElementsByTagName("relatedFrame"));
431 String type = frameRelation.getAttribute("type");
432
433 URI typeURI = null;
434 switch (type) {
435 case "Inherits from":
436 typeURI = PMOFN.INHERITS_FROM;
437 break;
438 case "Is Causative of":
439 typeURI = PMOFN.IS_CAUSATIVE_OF;
440 break;
441 case "Is Inchoative of":
442 typeURI = PMOFN.IS_INCHOATIVE_OF;
443 break;
444 case "Perspective on":
445 typeURI = PMOFN.PERSPECTIVE_ON;
446 break;
447 case "Precedes":
448 typeURI = PMOFN.PRECEDES;
449 break;
450 case "See also":
451 typeURI = PMOFN.SEE_ALSO;
452 break;
453 case "Subframe of":
454 typeURI = PMOFN.SUBFRAME_OF;
455 break;
456 case "Uses":
457 typeURI = PMOFN.USES;
458 break;
459 }
460
461 if (typeURI == null) {
462 continue;
463 }
464
465 for (Element relatedFrame : relatedFrames) {
466 String relatedFrameName = relatedFrame.getTextContent();
467
468 if (bugMap.contains(relatedFrameName)) {
469 continue;
470 }
471 addStatementToSink(frameURI, typeURI,
472 uriForRoleset(relatedFrameName.toLowerCase()));
473 typesFreqs.add(typeURI);
474 }
475
476 }
477
478 final Match lexUnits = JOOX.$(element.getElementsByTagName("lexUnit"));
479 for (Element lexUnit : lexUnits) {
480 luCount++;
481 String leCBy = lexUnit.getAttribute("cBy");
482 String leCDate = lexUnit.getAttribute("cDate");
483 String leIdentifier = lexUnit.getAttribute("ID");
484 String incorporatedFE = lexUnit.getAttribute("incorporatedFE");
485
486
487 String name = lexUnit.getAttribute("name");
488
489 String status = lexUnit.getAttribute("status");
490 String pos = lexUnit.getAttribute("POS").toLowerCase();
491 String leDefinition = JOOX.$(lexUnit.getElementsByTagName("definition")).text();
492
493
494 StringBuilder builder = new StringBuilder();
495 Match lexemes = JOOX.$(lexUnit).children("lexeme");
496
497 List<String> lexemeList = new ArrayList<>();
498 List<String> posList = new ArrayList<>();
499 for (Element lexeme : lexemes) {
500 String lemmaName = lexeme.getAttribute("name");
501 String lemmaPos = lexeme.getAttribute("POS");
502
503
504
505
506
507
508
509 lexemeList.add(lemmaName);
510 posList.add(lemmaPos);
511 }
512
513 String goodLemma = String.join(" ", lexemeList);
514 String uriLemma = String.join("+", lexemeList);
515 URI lexicalEntryURI = addLexicalEntry(goodLemma, uriLemma, lexemeList, posList, pos, getLexicon());
516 URI luURI = getLuURI(pos, uriLemma, frameName.toLowerCase());
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535 lus.put(leIdentifier, luURI);
536
537 Match stElements = JOOX.$(lexUnit).children("semType");
538 addSemTypes(stElements, semTypesFreq, semTypesForFrame, luURI, frameURI);
539
540 addStatementToSink(luURI, RDF.TYPE, PMOFN.LEXICAL_UNIT);
541 addStatementToSink(luURI, PMO.EVOKED_CONCEPT, frameURI);
542 addStatementToSink(luURI, PMO.EVOKING_ENTRY, lexicalEntryURI);
543 addStatementToSink(luURI, LEXINFO.PART_OF_SPEECH_P, getPosURI(pos));
544 addStatementToSink(luURI, DCTERMS.IDENTIFIER, Integer.parseInt(leIdentifier));
545 addStatementToSink(luURI, SKOS.DEFINITION, leDefinition);
546 addStatementToSink(luURI, RDFS.LABEL, name, false);
547
548 URI statusURI = getStatusURI(status);
549 addStatementToSink(luURI, PMOFN.STATUS, statusURI);
550
551 URI leCByURI = addCBy(leCBy);
552 Date leDate = format.parse(leCDate);
553
554 addStatementToSink(luURI, DCTERMS.CREATOR, leCByURI);
555 addStatementToSink(luURI, DCTERMS.CREATED, leDate);
556 addStatementToSink(lexicalEntryURI, ONTOLEX.EVOKES, frameURI);
557
558
559
560
561
562
563
564
565
566
567
568 if (incorporatedFE != null && incorporatedFE.trim().length() > 0) {
569 incorporatedFE = incorporatedFE.trim();
570 if (FEs.contains(incorporatedFE)) {
571 URI argumentURI = uriForArgument(frameName.toLowerCase(),
572 incorporatedFE.toLowerCase());
573 addStatementToSink(luURI, PMOFN.INCORPORATED_FRAME_ELEMENT, argumentURI);
574 }
575 }
576 }
577
578 }
579
580 } catch (final Exception ex) {
581 throw new IOException(ex);
582 }
583 }
584 }
585
586 LOGGER.info("Extracted {} lexical units", luCount);
587 LOGGER.info("Extracted {} class mappings", mapCount);
588 LOGGER.info("Extracted {} role mappings", mapRoleCount);
589
590 int semTypesCount = 0;
591 for (URI uri : semTypesFreq.keySet()) {
592 semTypesCount += semTypesFreq.get(uri);
593 }
594 LOGGER.info("Extracted {} semantic types", semTypesCount);
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615 if (extractExamples) {
616
617 int totalCount = 0;
618 int skippedCount = 0;
619
620 LOGGER.info("Extracting examples");
621 for (final File file : Files.fileTreeTraverser().preOrderTraversal(paths.get("lu"))) {
622 if (!file.isDirectory() && file.getName().endsWith(".xml")) {
623 LOGGER.debug("Processing {} ...", file);
624
625 try {
626 document = dbf.newDocumentBuilder().parse(file);
627 final Match lexUnits = JOOX.$(document.getElementsByTagName("lexUnit"));
628 String frameName = lexUnits.attr("frame");
629 String luID = lexUnits.attr("ID");
630
631
632 if (lus.get(luID) == null) {
633 LOGGER.error("LU {} is not present in Map", luID);
634 continue;
635 }
636
637 URI frameURI = uriForRoleset(frameName.toLowerCase());
638 URI luURI = lus.get(luID);
639
640 final Match examples = JOOX.$(document.getElementsByTagName("sentence"));
641
642 for (Element example : examples) {
643
644 synchronized (this) {
645
646
647
648
649 Collection<Statement> tempStatements = new ArrayList<>();
650 RDFHandler tempSink = RDFHandlers.wrap(tempStatements);
651 boolean keep = true;
652 setSink(tempSink);
653
654
655
656 boolean hasTarget = false;
657 URI asURI = null;
658 totalCount++;
659
660 String id = example.getAttribute("ID");
661 URI exampleURI = uriForExample(id);
662
663 String text = JOOX.$(example.getElementsByTagName("text")).text();
664
665 Match layers = JOOX.$(example.getElementsByTagName("layer"));
666
667 Set<Integer> starts = new HashSet<>();
668 Set<Integer> ends = new HashSet<>();
669
670 Matcher matcher = TOKEN_REGEX.matcher(text);
671
672 while (matcher.find()) {
673 starts.add(matcher.start());
674 ends.add(matcher.end() - 1);
675 }
676
677 if (starts.size() == 0 || ends.size() == 0) {
678 LOGGER.error("A set is empty");
679 keep = false;
680 }
681
682
683 for (Element layer : layers) {
684 String layerName = layer.getAttribute("name");
685
686 if (layerName == null) {
687 continue;
688 }
689
690 if (layerName.equals("Target")) {
691 Match labels = JOOX.$(layer.getElementsByTagName("label"));
692
693 Integer targetStart = null;
694 Integer targetEnd = null;
695
696 for (Element label : labels) {
697
698 Integer start = null;
699 Integer end = null;
700 try {
701 start = Integer.parseInt(label.getAttribute("start"));
702 end = Integer.parseInt(label.getAttribute("end"));
703 } catch (Exception e) {
704
705 }
706
707 if (start != null && !starts.contains(start)) {
708 LOGGER.debug("Error in start index, skipping ({} - {})", luID,
709 text);
710 continue;
711 }
712 if (end != null && !ends.contains(end)) {
713 LOGGER.debug("Error in end index, skipping ({} - {})", luID, text);
714 continue;
715 }
716
717 if (start != null) {
718 if (targetStart == null || targetStart > start) {
719 targetStart = start;
720 }
721 }
722 if (end != null) {
723 if (targetEnd == null || targetEnd < end) {
724 targetEnd = end;
725 }
726 }
727 }
728
729 if (targetStart == null) {
730 LOGGER.debug("Target start is null");
731 continue;
732 }
733 if (targetEnd == null) {
734 LOGGER.debug("Target end is null");
735 continue;
736 }
737
738 hasTarget = true;
739 asURI = createURI(exampleURI + "_annSet_" + targetStart);
740 addStatementToSink(asURI, RDF.TYPE, PMO.ANNOTATION_SET, EXAMPLE_GRAPH);
741
742 URI markableURI = uriForMarkable(exampleURI, targetStart, targetEnd);
743 String anchor = text.substring(targetStart, targetEnd);
744
745 URI aURI = createURI(asURI + "_pred");
746 addStatementToSink(aURI, RDF.TYPE, NIF.ANNOTATION_C, EXAMPLE_GRAPH);
747 addStatementToSink(asURI, PMO.ITEM, aURI, EXAMPLE_GRAPH);
748 addStatementToSink(aURI, PMO.VALUE_OBJ, frameURI, EXAMPLE_GRAPH);
749 addStatementToSink(aURI, PMO.VALUE_OBJ, lus.get(luID), EXAMPLE_GRAPH);
750 addStatementToSink(exampleURI, NIF.ANNOTATION_P, aURI, EXAMPLE_GRAPH);
751
752 addStatementToSink(markableURI, RDF.TYPE, PMO.MARKABLE,
753 EXAMPLE_GRAPH);
754 addStatementToSink(markableURI, NIF.ANCHOR_OF, anchor, EXAMPLE_GRAPH);
755
756
757 addStatementToSink(markableURI, NIF.ANNOTATION_P, aURI, EXAMPLE_GRAPH);
758 addStatementToSink(markableURI, NIF.BEGIN_INDEX, targetStart,
759 EXAMPLE_GRAPH);
760 addStatementToSink(markableURI, NIF.END_INDEX, targetEnd, EXAMPLE_GRAPH);
761 addStatementToSink(markableURI, NIF.REFERENCE_CONTEXT, exampleURI,
762 EXAMPLE_GRAPH);
763 }
764 }
765
766 if (!hasTarget) {
767 LOGGER.debug("Skipped example: {} in {}", id, luID);
768 keep = false;
769 }
770
771 addStatementToSink(exampleURI, RDF.TYPE, PMO.EXAMPLE, EXAMPLE_GRAPH);
772 addStatementToSink(exampleURI, NIF.IS_STRING, text, EXAMPLE_GRAPH);
773
774
775
776
777 int i = 0;
778 if (hasTarget) {
779 for (Element layer : layers) {
780 String layerName = layer.getAttribute("name");
781
782 if (layerName == null) {
783 continue;
784 }
785
786 if (layerName.equals("FE")) {
787 Match labels = JOOX.$(layer.getElementsByTagName("label"));
788 for (Element label : labels) {
789 String roleName = label.getAttribute("name");
790 URI argumentURI = uriForArgument(frameName.toLowerCase(),
791 roleName.toLowerCase());
792
793 String anchor = null;
794
795 Integer start = null;
796 Integer end = null;
797 try {
798 start = Integer.parseInt(label.getAttribute("start"));
799 end = Integer.parseInt(label.getAttribute("end"));
800
801 if (start + end > 0) {
802 anchor = text.substring(start, end);
803 }
804 } catch (Exception e) {
805
806 }
807
808 if (start != null && !starts.contains(start)) {
809 LOGGER.debug("Error in start index, skipping ({} - {})", luID,
810 text);
811 keep = false;
812 continue;
813 }
814 if (end != null && !ends.contains(end)) {
815 LOGGER.debug("Error in end index, skipping ({} - {})", luID,
816 text);
817 keep = false;
818 continue;
819 }
820
821 i++;
822
823 URI aURI = createURI(asURI + "_arg" + i);
824 addStatementToSink(asURI, PMO.ITEM, aURI, EXAMPLE_GRAPH);
825 addStatementToSink(aURI, RDF.TYPE, NIF.ANNOTATION_C, EXAMPLE_GRAPH);
826 addStatementToSink(aURI, PMO.VALUE_OBJ, argumentURI, EXAMPLE_GRAPH);
827 addStatementToSink(exampleURI, NIF.ANNOTATION_P, aURI,
828 EXAMPLE_GRAPH);
829
830 if (anchor == null) {
831
832 } else {
833
834 URI markableURI = uriForMarkable(exampleURI, start, end);
835
836 addStatementToSink(markableURI, RDF.TYPE, PMO.MARKABLE,
837 EXAMPLE_GRAPH);
838 addStatementToSink(markableURI, NIF.ANCHOR_OF, anchor,
839 EXAMPLE_GRAPH);
840 addStatementToSink(markableURI, NIF.ANNOTATION_P, aURI,
841 EXAMPLE_GRAPH);
842 addStatementToSink(markableURI, NIF.BEGIN_INDEX, start,
843 EXAMPLE_GRAPH);
844 addStatementToSink(markableURI, NIF.END_INDEX, end,
845 EXAMPLE_GRAPH);
846 addStatementToSink(markableURI, NIF.REFERENCE_CONTEXT,
847 exampleURI, EXAMPLE_GRAPH);
848 }
849 }
850 }
851 }
852 }
853
854 setDefaultSinkAsSink();
855
856 if (!keep) {
857 skippedCount++;
858 continue;
859 }
860
861 for (Statement statement : tempStatements) {
862 addStatementToSink(statement);
863 }
864 }
865 }
866
867
868 setDefaultSinkAsSink();
869
870 } catch (final Exception ex) {
871 throw new IOException(ex);
872 }
873 }
874 }
875
876 LOGGER.info("Extracted examples: {}/{}", totalCount - skippedCount, totalCount);
877 }
878
879 } catch (final Exception ex) {
880 throw new IOException(ex);
881 }
882 }
883
884 private void addSemTypes(Match stElements, FrequencyHashSet<URI> semTypesFreq,
885 FrequencyHashSet<URI> semTypesForFrame, URI baseURI, URI frameURI) {
886 for (Element stElement : stElements) {
887 String LUSemType = stElement.getAttribute("name");
888 URI LUSemTypeURI = null;
889 if (LUSemType != null) {
890 LUSemTypeURI = getSemTypeURI(LUSemType);
891 }
892 if (LUSemTypeURI != null) {
893 addStatementToSink(baseURI, PMOFN.SEM_TYPE_P, LUSemTypeURI);
894 semTypesFreq.add(LUSemTypeURI);
895 semTypesForFrame.add(frameURI);
896 }
897 }
898 }
899
900 private URI addCBy(String cBy) {
901 URI cbyURI = uriForCBy(cBy);
902 addStatementToSink(cbyURI, DCTERMS.IDENTIFIER, cBy, false);
903 return cbyURI;
904 }
905
906 private URI uriForCBy(String cBy) {
907 StringBuilder builder = new StringBuilder();
908 builder.append(NAMESPACE);
909 builder.append(cBy.toLowerCase());
910 builder.append("_Creator");
911 return createURI(builder.toString());
912 }
913
914 private URI uriForExample(String exampleID) {
915 StringBuilder builder = new StringBuilder();
916 builder.append(NAMESPACE);
917 builder.append(prefix).append(separator);
918 builder.append("example_");
919 builder.append(exampleID);
920 return createURI(builder.toString());
921 }
922
923 private void addSemTypeToSink(Element semType) {
924 String name = semType.getAttribute("name");
925 String abbrev = semType.getAttribute("abbrev");
926 String id = semType.getAttribute("ID");
927
928 String definition = JOOX.$(semType.getElementsByTagName("definition")).text();
929 String supID = JOOX.$(semType.getElementsByTagName("superType")).attr("supID");
930 String superTypeName = JOOX.$(semType.getElementsByTagName("superType")).attr("superTypeName");
931
932 URI semTypeURI = getSemTypeURI(name);
933
934 addStatementToSink(semTypeURI, RDF.TYPE, PMOFN.SEM_TYPE_C);
935 addStatementToSink(semTypeURI, DCTERMS.IDENTIFIER, Integer.parseInt(id));
936 addStatementToSink(semTypeURI, RDFS.LABEL, name, false);
937 addStatementToSink(semTypeURI, SKOS.DEFINITION, definition);
938 addStatementToSink(semTypeURI, PMO.ABBREVIATION, abbrev, false);
939
940 if (supID != null) {
941 URI superSemTypeURI = getSemTypeURI(superTypeName);
942 addStatementToSink(semTypeURI, PMOFN.SUB_TYPE_OF, superSemTypeURI);
943 }
944 }
945
946 private URI getSemTypeURI(String name) {
947 return getSemTypeURI(name, null);
948 }
949
950 private URI getSemTypeURI(String name, @Nullable String prefix) {
951 if (prefix == null) {
952 prefix = this.prefix;
953 }
954
955 name = name.toLowerCase();
956 name = name.replaceAll("-", "_");
957 name = name.replaceAll("\\s+", "_");
958
959 StringBuilder builder = new StringBuilder();
960 builder.append(NAMESPACE);
961 builder.append(prefix);
962 builder.append(separator);
963 builder.append(name);
964 builder.append("_semType");
965 return createURI(builder.toString());
966 }
967
968 private void addStatusToSink(Element statusType) {
969 String name = statusType.getAttribute("name");
970 String description = statusType.getAttribute("description");
971
972 URI statusURI = getStatusURI(name);
973 addStatementToSink(statusURI, RDF.TYPE, PMOFN.LUSTATUS);
974 addStatementToSink(statusURI, RDFS.LABEL, name, false);
975 addStatementToSink(statusURI, SKOS.DEFINITION, description);
976 }
977
978 private URI getLuURI(String pos, String luName, String frameName) {
979 return getLuURI(pos, luName, frameName, null);
980 }
981
982 private URI getLuURI(String pos, String luName, String frameName, @Nullable String prefix) {
983 if (prefix == null) {
984 prefix = this.prefix;
985 }
986 StringBuilder builder = new StringBuilder();
987 builder.append(NAMESPACE);
988 builder.append(CONCEPTUALIZATION_PREFIX);
989 builder.append(separator);
990
991
992
993
994 builder.append(LEXINFO.map.get(getPosURI(pos)));
995
996 builder.append(separator);
997 builder.append(luName.replaceAll("[^a-zA-Z0-9-_+]", ""));
998 builder.append(separator);
999 builder.append(rolesetPart(frameName, prefix));
1000 return createURI(builder.toString());
1001 }
1002
1003 private URI getStatusURI(String name) {
1004 return getStatusURI(name, null);
1005 }
1006
1007 private URI getStatusURI(String name, @Nullable String prefix) {
1008 if (prefix == null) {
1009 prefix = this.prefix;
1010 }
1011 StringBuilder builder = new StringBuilder();
1012 builder.append(NAMESPACE);
1013 builder.append(prefix);
1014 builder.append(separator);
1015 builder.append(name.toLowerCase());
1016 builder.append("_LUStatus");
1017 return createURI(builder.toString());
1018 }
1019
1020 protected URI getPosURI(String pos) {
1021
1022 pos = pos.toUpperCase();
1023
1024 switch (pos) {
1025 case "A":
1026 return LEXINFO.ADJECTIVE;
1027 case "ADV":
1028 return LEXINFO.ADVERB;
1029 case "ART":
1030 return LEXINFO.DETERMINER;
1031 case "C":
1032 return LEXINFO.CONJUNCTION;
1033 case "INTJ":
1034 return LEXINFO.INTERJECTION;
1035 case "N":
1036 return LEXINFO.NOUN;
1037 case "NUM":
1038 return LEXINFO.CARDINAL_NUMERAL;
1039 case "PREP":
1040 return LEXINFO.PREPOSITION;
1041 case "PRON":
1042 return LEXINFO.PRONOUN;
1043 case "SCON":
1044 return LEXINFO.SUBORDINATING_CONJUNCTION;
1045 case "V":
1046 return LEXINFO.VERB;
1047 case "IDIO":
1048 return PMO.IDIOSYNCRATIC;
1049 case "AVP":
1050 return LEXINFO.PARTICLE;
1051 }
1052
1053 return null;
1054 }
1055
1056 @Override public String getArgLabel() {
1057 return "";
1058 }
1059
1060 @Override protected String formatArg(String arg) {
1061 return super.formatArg(arg).toLowerCase();
1062 }
1063 }