1 package eu.fbk.dkm.premon.premonitor;
2
3 import com.google.common.io.Files;
4 import eu.fbk.dkm.premon.vocab.LEXINFO;
5 import eu.fbk.dkm.premon.vocab.PMO;
6 import org.apache.commons.csv.CSVFormat;
7 import org.apache.commons.csv.CSVParser;
8 import org.apache.commons.csv.CSVRecord;
9 import org.joox.JOOX;
10 import org.joox.Match;
11 import org.openrdf.model.URI;
12 import org.openrdf.model.vocabulary.RDF;
13 import org.openrdf.model.vocabulary.RDFS;
14 import org.openrdf.rio.RDFHandler;
15 import org.w3c.dom.Document;
16 import org.w3c.dom.Element;
17
18 import javax.xml.parsers.DocumentBuilderFactory;
19 import java.io.File;
20 import java.io.IOException;
21 import java.nio.charset.Charset;
22 import java.util.*;
23 import java.util.regex.Matcher;
24 import java.util.regex.Pattern;
25
26 public class PredMatConverter extends Converter {
27
28 final private int ID_LANG = 1 -1;
29 final private int VN_SUBCLASS = 7 -1;
30 final private int VN_CLASS = 5 -1;
31 final private int VN_LEMA = 9 -1;
32 final private int VN_ROLE = 10 -1;
33 final private int FN_FRAME = 13 -1;
34 final private int FN_LE = 14 -1;
35 final private int FN_FRAME_ELEMENT = 15 -1;
36 final private int PB_ROLESET = 16 -1;
37 final private int PB_ARG = 17 -1;
38 final private int WN_SENSE = 11 -1;
39
40 final private String FILE_NAME = "PredicateMatrix.v1.3.txt";
41
42 private static final Pattern VN_PATTERN = Pattern.compile("([^-]+)-(.*)");
43 private static final Pattern WN_PATTERN = Pattern.compile("#([^#]+)$");
44
45 private static final String DEFAULT_TYPE = "v";
46
47 private Map<String, String> vnMap = new HashMap<>();
48
49 private ArrayList<String> vnLinks = new ArrayList<>();
50 private ArrayList<String> fnLinks = new ArrayList<>();
51 private ArrayList<String> pbLinks = new ArrayList<>();
52
53 protected Set entries = new HashSet();
54
55 public PredMatConverter(File path, RDFHandler sink, Properties properties, Map<String, URI> wnInfo){
56 super(path, properties.getProperty("source"), sink, properties, properties.getProperty("language"), wnInfo);
57
58
59 addLinks(pbLinks, properties.getProperty("linkpb"));
60 addLinks(fnLinks, properties.getProperty("linkfn"));
61 addLinks(vnLinks, properties.getProperty("linkvn"));
62
63 String vnPath = properties.getProperty("vnpath");
64 if (vnPath != null) {
65 LOGGER.info("Loading VerbNet");
66 File vnFile = new File(vnPath);
67 if (vnFile.exists() && vnFile.isDirectory()) {
68 final DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
69
70 for (final File file : Files.fileTreeTraverser().preOrderTraversal(vnFile)) {
71 if (!file.isDirectory() && file.getName().endsWith(".xml")) {
72 LOGGER.debug("Processing {} ...", file);
73
74 try {
75 final Document document = dbf.newDocumentBuilder().parse(file);
76 final Match vnClass = JOOX.$(document.getElementsByTagName("VNCLASS"))
77 .add(JOOX.$(document.getElementsByTagName("VNSUBCLASS")));
78
79 for (Element thisClass : vnClass) {
80 String id = thisClass.getAttribute("ID");
81 Matcher mID = VN_PATTERN.matcher(id);
82 if (mID.find()) {
83 vnMap.put(mID.group(2), mID.group(1));
84 } else {
85 LOGGER.error("Unable to parse {}", id);
86 }
87 }
88
89 } catch (final Exception ex) {
90 ex.printStackTrace();
91 }
92 }
93 }
94
95 }
96 }
97
98 LOGGER.info("Links to: {}", pbLinks.toString());
99 LOGGER.info("Links to: {}", vnLinks.toString());
100 LOGGER.info("Links to: {}", fnLinks.toString());
101 LOGGER.info("Starting dataset: {}", prefix);
102 }
103
104 @Override public void convert() throws IOException {
105
106 addMetaToSink();
107
108 File PredMat = new File(this.path + File.separator + FILE_NAME);
109
110 try{
111
112 List<URI> classes = new ArrayList<URI>();
113 List<URI> conceptualizations = new ArrayList<URI>();
114 List<URI> arguments = new ArrayList<URI>();
115
116 CSVParser parser = CSVParser.parse(PredMat, Charset.defaultCharset(), CSVFormat.TDF.withSkipHeaderRecord());
117
118 for(CSVRecord pme : parser) {
119
120 String lang = pme.get(ID_LANG);
121 if(lang.compareToIgnoreCase("id:eng")!=0){
122 continue;
123 }
124
125 String vnSc = pme.get(VN_SUBCLASS).compareToIgnoreCase("vn:null") == 0?
126 pme.get(VN_CLASS) : pme.get(VN_SUBCLASS);
127 String vnLe = pme.get(VN_LEMA);
128 String vnSr = pme.get(VN_ROLE);
129 vnSr = vnSr.toLowerCase();
130
131 String fnSc = pme.get(FN_FRAME);
132 fnSc = fnSc.toLowerCase();
133 String fnLe = pme.get(FN_LE);
134 fnLe = fnLe.compareToIgnoreCase("fn:null") == 0?fnLe:fnLe.substring(0, fnLe.length()-2);
135 String fnSr = pme.get(FN_FRAME_ELEMENT);
136 fnSr = fnSr.toLowerCase();
137
138 String pbSc = pme.get(PB_ROLESET);
139 String pbSr = pme.get(PB_ARG);
140 pbSr = pbSr.toLowerCase();
141
142 String wnSense = pme.get(WN_SENSE);
143
144 vnSc = removeNameSpace(vnSc); vnLe = removeNameSpace(vnLe); vnSr = removeNameSpace(vnSr);
145 fnSc = removeNameSpace(fnSc); fnLe = removeNameSpace(fnLe); fnSr = removeNameSpace(fnSr);
146 pbSc = removeNameSpace(pbSc); pbSr = removeNameSpace(pbSr);
147 wnSense = removeNameSpace(wnSense);
148
149 pbSr = pbSr == null? null : "arg" + pbSr;
150
151 String hash = vnSc + vnLe + vnSr + fnSc + fnLe + fnSr + pbSc + pbSr + wnSense;
152 if(!entries.add(hash.hashCode())){
153 notadded++;
154 total++;
155 continue;
156 }
157 added++;
158 total++;
159
160 for (String vnLink : vnLinks) {
161 for (String fnLink : fnLinks) {
162 for (String pbLink : pbLinks) {
163 String vnID = vnSc == null? null : getVnID(vnSc);
164 URI vnClassURI = vnID == null? null : uriForRoleset(vnID, vnLink);
165 URI vnConceptualizationURI = vnLe == null || vnID == null?
166 null : uriForConceptualizationWithPrefix(vnLe, DEFAULT_TYPE, vnID, vnLink);
167 URI vnArgURI = vnSr == null || vnID == null? null : uriForArgument(vnID, vnSr, vnLink);
168
169 URI fnFrameURI = fnSc == null? null : uriForRoleset(fnSc, fnLink);
170 URI fnConceptualizationURI = fnLe == null || fnSc == null?
171 null : uriForConceptualizationWithPrefix(fnLe, DEFAULT_TYPE, fnSc, fnLink);
172 URI fnArgURI = fnSr == null || fnSc == null? null : uriForArgument(fnSc, fnSr, fnLink);
173
174 URI pbRolesetURI = pbSc == null? null : uriForRoleset(pbSc, pbLink);
175 URI pbConceptualizationURI = pbSc == null?
176 null : uriForConceptualizationWithPrefix(pbSc.substring(0, pbSc.indexOf(".")),
177 DEFAULT_TYPE, pbSc, pbLink);
178 URI pbArgURI = pbSr == null || pbSc == null? null : uriForArgument(pbSc, pbSr, pbLink);
179
180 URI wnSenseURI = wnSense == null?
181 null : uriForWnSense(wnSense, wnSense.substring(0,wnSense.indexOf("%")));
182
183
184 if(vnClassURI != null){
185 classes.add(vnClassURI);
186 }if(fnFrameURI != null){
187 classes.add(fnFrameURI);
188 }if(pbRolesetURI != null){
189 classes.add(pbRolesetURI);
190 }
191
192 if(vnConceptualizationURI != null){
193 conceptualizations.add(vnConceptualizationURI);
194 }if(fnConceptualizationURI != null){
195 conceptualizations.add(fnConceptualizationURI);
196 }if(pbConceptualizationURI != null){
197 conceptualizations.add(pbConceptualizationURI);
198 }if(wnSenseURI != null) {
199 conceptualizations.add(wnSenseURI);
200 }
201
202 if(vnArgURI != null){
203 arguments.add(vnArgURI);
204 }if(fnArgURI != null){
205 arguments.add(fnArgURI);
206 }if(pbArgURI != null){
207 arguments.add(pbArgURI);
208 }
209
210 addMappings(classes, conceptualizations, arguments);
211
212 classes.clear();
213 conceptualizations.clear();
214 arguments.clear();
215 }
216 }
217 }
218
219 }
220
221 LOGGER.info("Element added: " + added + ", not added: " + notadded + " of " + total);
222 LOGGER.info("Class mappings: {}, Conceptualization mappings: {}, Role mappings: {}", nclass, ncon, nrole);
223 }catch (IOException e){
224 throw e;
225 }
226 }
227
228 @Override protected URI getPosURI(String textualPOS) {
229 return LEXINFO.VERB;
230 }
231
232 @Override public String getArgLabel() {
233 return "";
234 }
235
236 private String removeNameSpace(String str){
237 String strNoNS = str.substring(3, str.length());
238 if(strNoNS.compareToIgnoreCase("NULL") == 0){
239 return null;
240 }else{
241 return strNoNS;
242 }
243 }
244
245 private String getVnID(String vnSc){
246 String vnID = vnMap.get(vnSc);
247
248 if (vnID == null) {
249 LOGGER.error("VerbNet ID {} not found", vnSc);
250 vnID = null;
251 }else{
252 vnID = vnID + "-" + vnSc;
253 }
254
255 return vnID;
256 }
257
258 private URI uriForWnSense(String wnSense, String uriLemma){
259
260 URI wnConceptualizationURI = null;
261
262 if (wnSense != null && this.wnInfo.size() > 0) {
263 final String[] wns = wnSense.split("\\s+");
264
265 for (String wn : wns) {
266
267 wn = wn.trim();
268
269 if (wn.length() == 0) {
270 continue;
271 }
272
273
274
275
276
277
278
279
280 final URI wnURI = this.wnInfo.get(wn);
281
282 if (wnURI == null) {
283 LOGGER.warn("No wnURI found for {}", wn);
284 continue;
285 }
286
287 String lemma = wn.substring(0, wn.indexOf('%'));
288 final URI reference = this.wnInfo.get(wnURI.toString() + "|" + lemma);
289
290 if (reference == null) {
291 LOGGER.warn("No reference found for {} / {}", wnURI.toString(), lemma);
292 continue;
293 }
294
295 final Matcher m = WN_PATTERN.matcher(reference.toString());
296 if (!m.find()) {
297 continue;
298 }
299
300 String wnuri = wnURI.toString();
301 URI type = null;
302 if(wnuri.endsWith("-v")){
303 type = LEXINFO.VERB;
304 }else if(wnuri.endsWith("-n")){
305 type = LEXINFO.NOUN;
306 }else if(wnuri.endsWith("-r")){
307 type = LEXINFO.ADVERB;
308 }else if(wnuri.endsWith("-a")){
309 type = LEXINFO.ADJECTIVE;
310 }
311
312 URI lexicalEntryURI = uriForLexicalEntry(lemma, type);
313
314 wnConceptualizationURI = uriForConceptualizationWithPrefix(uriLemma,
315 "v", wnURI.toString().replace(WN_NAMESPACE,""), "wn31");
316
317 addStatementToSink(wnConceptualizationURI, RDF.TYPE, PMO.CONCEPTUALIZATION);
318 addStatementToSink(wnConceptualizationURI, PMO.EVOKING_ENTRY, lexicalEntryURI);
319 addStatementToSink(wnConceptualizationURI, RDFS.SEEALSO, reference);
320 addStatementToSink(wnConceptualizationURI, PMO.EVOKED_CONCEPT, wnURI);
321 }
322 }
323 return wnConceptualizationURI;
324 }
325
326 }