1 package eu.fbk.dkm.premon.premonitor;
2
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilderFactory;

import com.google.common.collect.HashMultimap;
import com.google.common.io.Files;

import org.joox.JOOX;
import org.joox.Match;
import org.openrdf.model.URI;
import org.openrdf.rio.RDFHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

import eu.fbk.dkm.premon.vocab.LEXINFO;
28
29
30
31
32
33
34
35
36 public class SemlinkConverter extends Converter {
37
38 private static final Logger LOGGER = LoggerFactory.getLogger(SemlinkConverter.class);
39 private static final Pattern VN_PATTERN = Pattern.compile("([^-]+)-(.*)");
40 private static final Pattern VN_SC_PATTERN = Pattern.compile("(.*)-[0-9]+");
41
42
43
44
45
46
47
48
49
50 private static final String DEFAULT_TYPE = "v";
51
52 protected Map<String, String> vnMap = new HashMap<>();
53
54 ArrayList<String> pbLinks = new ArrayList<>();
55 ArrayList<String> vnLinks = new ArrayList<>();
56 ArrayList<String> fnLinks = new ArrayList<>();
57
58 public SemlinkConverter(File path, RDFHandler sink, Properties properties, Map<String, URI> wnInfo) {
59 super(path, properties.getProperty("source"), sink, properties, properties.getProperty("language"), wnInfo);
60
61 addLinks(pbLinks, properties.getProperty("linkpb"));
62 addLinks(fnLinks, properties.getProperty("linkfn"));
63 addLinks(vnLinks, properties.getProperty("linkvn"));
64
65 String vnPath = properties.getProperty("vnpath");
66 if (vnPath != null) {
67 LOGGER.info("Loading VerbNet");
68 File vnFile = new File(vnPath);
69 if (vnFile.exists() && vnFile.isDirectory()) {
70 final DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
71
72 for (final File file : Files.fileTreeTraverser().preOrderTraversal(vnFile)) {
73 if (!file.isDirectory() && file.getName().endsWith(".xml")) {
74 LOGGER.debug("Processing {} ...", file);
75
76 try {
77 final Document document = dbf.newDocumentBuilder().parse(file);
78 final Match vnClass = JOOX.$(document.getElementsByTagName("VNCLASS"))
79 .add(JOOX.$(document.getElementsByTagName("VNSUBCLASS")));
80
81 for (Element thisClass : vnClass) {
82 String id = thisClass.getAttribute("ID");
83 Matcher mID = VN_PATTERN.matcher(id);
84 if (mID.find()) {
85 vnMap.put(mID.group(2), mID.group(1));
86 } else {
87 LOGGER.error("Unable to parse {}", id);
88 }
89 }
90
91 } catch (final Exception ex) {
92 ex.printStackTrace();
93 }
94 }
95 }
96
97 }
98 }
99
100 LOGGER.info("Links to: {}", pbLinks.toString());
101 LOGGER.info("Links to: {}", vnLinks.toString());
102 LOGGER.info("Links to: {}", fnLinks.toString());
103 LOGGER.info("Starting dataset: {}", prefix);
104 }
105
106 @Override public void convert() throws IOException {
107
108 addMetaToSink();
109
110 final DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
111
112 File vnPbMappings = new File(this.path + File.separator + "vn-pb" + File.separator + "vnpbMappings");
113 File vnFnMappings = new File(this.path + File.separator + "vn-fn" + File.separator + "VNC-FNF.s");
114 File vnFnMappingsRole = new File(
115 this.path + File.separator + "vn-fn" + File.separator + "VN-FNRoleMapping.txt");
116
117 Document document;
118
119 try {
120
121 LOGGER.debug("Processing {} ...", vnPbMappings);
122 document = dbf.newDocumentBuilder().parse(vnPbMappings);
123 final Match predicates = JOOX.$(document.getElementsByTagName("predicate"));
124
125 for (Element predicate : predicates) {
126 String lemma = predicate.getAttribute("lemma");
127 String uriLemma = BankConverter.getLemmaFromPredicateName(lemma);
128
129 final Match argmaps = JOOX.$(predicate.getElementsByTagName("argmap"));
130 for (Element argmap : argmaps) {
131 String pbRoleset = argmap.getAttribute("pb-roleset");
132 String vnClass = argmap.getAttribute("vn-class");
133
134 String vnID = vnMap.get(vnClass);
135 if (vnID == null) {
136 LOGGER.error("VerbNet ID {} not found", vnClass);
137 vnID = "INVALID";
138
139 }
140 vnID = vnID + "-" + vnClass;
141
142 addMapping(pbLinks, vnLinks, uriLemma, pbRoleset, vnID);
143
144 final Match roles = JOOX.$(argmap.getElementsByTagName("role"));
145 for (Element role : roles) {
146 String pbArg = "arg" + role.getAttribute("pb-arg");
147 String vnTheta = role.getAttribute("vn-theta");
148
149 vnTheta = vnTheta.toLowerCase();
150
151 for (String pbLink : pbLinks) {
152 for (String vnLink : vnLinks) {
153
154 URI pbRolesetURI = uriForRoleset(pbRoleset, pbLink);
155 URI pbConceptualizationURI = uriForConceptualizationWithPrefix(lemma, DEFAULT_TYPE, pbRoleset, pbLink);
156 URI pbArgURI = uriForArgument(pbRoleset, pbArg, pbLink);
157
158 URI vnClassURI = uriForRoleset(vnID, vnLink);
159 URI vnConceptualizationURI = uriForConceptualizationWithPrefix(lemma, DEFAULT_TYPE, vnID, vnLink);
160 URI vnArgURI = uriForArgument(vnID, vnTheta, vnLink);
161
162 addMappings(vnClassURI, pbRolesetURI, vnConceptualizationURI, pbConceptualizationURI, vnArgURI, pbArgURI);
163
164
165
166
167
168
169
170 }
171 }
172 }
173
174 }
175
176 }
177
178 LOGGER.debug("Processing {} ...", vnFnMappings);
179 HashMultimap<String, String> vnfnMap = HashMultimap.create();
180 HashMultimap<String, String> vnfnLemmaMap = HashMultimap.create();
181 document = dbf.newDocumentBuilder().parse(vnFnMappings);
182 final Match vnClasses = JOOX.$(document.getElementsByTagName("vncls"));
183
184 for (Element vnClass : vnClasses) {
185 String vnCls = vnClass.getAttribute("class");
186 String lemma = vnClass.getAttribute("vnmember");
187 String uriLemma = BankConverter.getLemmaFromPredicateName(lemma);
188
189 String frame = vnClass.getAttribute("fnframe");
190 frame = frame.toLowerCase();
191
192 vnfnMap.put(vnCls, frame);
193
194 String vnID = vnMap.get(vnCls);
195 if (vnID == null) {
196 LOGGER.error("VerbNet ID {} not found", vnCls);
197 vnID = "INVALID";
198
199 }
200 vnID = vnID + "-" + vnCls;
201
202 vnfnLemmaMap.put(vnCls + "-" + frame, uriLemma + "|" + vnID);
203 LOGGER.trace("{} -> {}", vnCls, frame);
204
205 Matcher matcher = VN_SC_PATTERN.matcher(vnCls);
206 while (matcher.find()) {
207 String newVnCls = matcher.group(1);
208 vnfnMap.put(newVnCls, frame);
209 vnfnLemmaMap.put(newVnCls + "-" + frame, uriLemma + "|" + vnID);
210 LOGGER.trace("{} -> {}", newVnCls, frame);
211 matcher = VN_SC_PATTERN.matcher(newVnCls);
212 }
213
214 addMapping(fnLinks, vnLinks, uriLemma, frame, vnID);
215 }
216
217 LOGGER.debug("Processing {} ...", vnFnMappingsRole);
218 int notFound = 0;
219 document = dbf.newDocumentBuilder().parse(vnFnMappingsRole);
220 final Match vnClasses2 = JOOX.$(document.getElementsByTagName("vncls"));
221
222 for (Element vnClass : vnClasses2) {
223 String vnCls = vnClass.getAttribute("class");
224 String frame = vnClass.getAttribute("fnframe");
225
226 frame = frame.toLowerCase();
227
228 String vnID = vnMap.get(vnCls);
229 if (vnID == null) {
230 LOGGER.error("VerbNet ID {} not found", vnCls);
231 continue;
232 }
233 vnID = vnID + "-" + vnCls;
234
235
236 Set<String> frames = vnfnMap.get(vnCls);
237 if (!frames.contains(frame)) {
238 LOGGER.error("Mapping not found: {} -> {}", vnCls, frame);
239 notFound++;
240 continue;
241 }
242
243 Set<String> lemmas = vnfnLemmaMap.get(vnCls + "-" + frame);
244 if (lemmas.size() == 0) {
245 LOGGER.error("No lemmas for {}", vnCls + "-" + frame);
246 }
247
248 final Match roles = JOOX.$(vnClass.getElementsByTagName("role"));
249 for (Element role : roles) {
250 String vnTheta = role.getAttribute("vnrole");
251 String fnrole = role.getAttribute("fnrole");
252
253 vnTheta = vnTheta.toLowerCase();
254 fnrole = fnrole.toLowerCase();
255
256 for (String fnLink : fnLinks) {
257 for (String vnLink : vnLinks) {
258
259 for (String l : lemmas) {
260
261 int index = l.indexOf('|');
262 String lemma = l.substring(0, index);
263 String vnSubClass = l.substring(index + 1);
264
265 URI fnFrameURI = uriForRoleset(frame, fnLink);
266 URI fnConceptualizationURI = uriForConceptualizationWithPrefix(lemma, DEFAULT_TYPE, frame, fnLink);
267 URI fnArgURI = uriForArgument(frame, fnrole, fnLink);
268
269 URI vnClassURI = uriForRoleset(vnSubClass, vnLink);
270 URI vnConceptualizationURI = uriForConceptualizationWithPrefix(lemma, DEFAULT_TYPE, vnSubClass, vnLink);
271 URI vnArgURI = uriForArgument(vnSubClass, vnTheta, vnLink);
272
273 addMappings(vnClassURI, fnFrameURI, vnConceptualizationURI, fnConceptualizationURI, vnArgURI, fnArgURI);
274
275
276
277
278
279
280
281
282
283
284
285
286 }
287 }
288 }
289 }
290 }
291
292 LOGGER.info("Roles not mapped: {}", notFound);
293
294 } catch (final Exception ex) {
295 throw new IOException(ex);
296 }
297 }
298
299 private void addMapping(ArrayList<String> links1, ArrayList<String> links2, String uriLemma, String p1, String p2) {
300 for (String link1 : links1) {
301 for (String link2 : links2) {
302 URI firstRolesetURI = uriForRoleset(p1, link1);
303 URI secondRolesetURI = uriForRoleset(p2, link2);
304 URI firstConceptualizationURI = uriForConceptualizationWithPrefix(uriLemma, DEFAULT_TYPE, p1, link1);
305 URI secondConceptualizationURI = uriForConceptualizationWithPrefix(uriLemma, DEFAULT_TYPE, p2, link2);
306 addMappings(firstRolesetURI, secondRolesetURI, firstConceptualizationURI, secondConceptualizationURI);
307
308 }
309 }
310 }
311
312 @Override protected URI getPosURI(String textualPOS) {
313 return LEXINFO.VERB;
314 }
315
316 @Override public String getArgLabel() {
317 return "";
318 }
319
320 }