1   package eu.fbk.dkm.premon.premonitor;
2   
3   import java.io.File;
4   import java.io.IOException;
5   import java.util.ArrayList;
6   import java.util.HashMap;
7   import java.util.Map;
8   import java.util.Properties;
9   import java.util.Set;
10  import java.util.regex.Matcher;
11  import java.util.regex.Pattern;
12  
13  import javax.xml.parsers.DocumentBuilderFactory;
14  
15  import com.google.common.collect.HashMultimap;
16  import com.google.common.io.Files;
17  
18  import org.joox.JOOX;
19  import org.joox.Match;
20  import org.openrdf.model.URI;
21  import org.openrdf.rio.RDFHandler;
22  import org.slf4j.Logger;
23  import org.slf4j.LoggerFactory;
24  import org.w3c.dom.Document;
25  import org.w3c.dom.Element;
26  
27  import eu.fbk.dkm.premon.vocab.LEXINFO;
28  
29  /*
30      Problems on version 3.2b
31      - Apparently useless examples.desktop and localmachine files
32      - pronounce-29.3.1 has no xml extension
33      - sound_emission-32.2.xml.bckup has no xml extension
34   */
35  
36  public class SemlinkConverter extends Converter {
37  
38      private static final Logger LOGGER = LoggerFactory.getLogger(SemlinkConverter.class);
39      private static final Pattern VN_PATTERN = Pattern.compile("([^-]+)-(.*)");
40      private static final Pattern VN_SC_PATTERN = Pattern.compile("(.*)-[0-9]+");
41  
42  //    private static final Pattern WN_PATTERN = Pattern.compile("#([^#]+)$");
43  //    private static final String LINK_PATTERN = "http://verbs.colorado.edu/verb-index/vn/%s.php";
44  //
45  //    private static final String DEFAULT_RESTRICTION_SUFFIX = "srs";
46  //    private static final String DEFAULT_FRAME_SUFFIX = "frame";
47  //    private static final String DEFAULT_EXAMPLE_SUFFIX = "ex";
48  //    private static final String DEFAULT_SYNITEM_SUFFIX = "SynItem";
49  
50      private static final String DEFAULT_TYPE = "v";
51  
52      protected Map<String, String> vnMap = new HashMap<>();
53  
54      ArrayList<String> pbLinks = new ArrayList<>();
55      ArrayList<String> vnLinks = new ArrayList<>();
56      ArrayList<String> fnLinks = new ArrayList<>();
57  
58      public SemlinkConverter(File path, RDFHandler sink, Properties properties, Map<String, URI> wnInfo) {
59          super(path, properties.getProperty("source"), sink, properties, properties.getProperty("language"), wnInfo);
60  
61          addLinks(pbLinks, properties.getProperty("linkpb"));
62          addLinks(fnLinks, properties.getProperty("linkfn"));
63          addLinks(vnLinks, properties.getProperty("linkvn"));
64  
65          String vnPath = properties.getProperty("vnpath");
66          if (vnPath != null) {
67              LOGGER.info("Loading VerbNet");
68              File vnFile = new File(vnPath);
69              if (vnFile.exists() && vnFile.isDirectory()) {
70                  final DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
71  
72                  for (final File file : Files.fileTreeTraverser().preOrderTraversal(vnFile)) {
73                      if (!file.isDirectory() && file.getName().endsWith(".xml")) {
74                          LOGGER.debug("Processing {} ...", file);
75  
76                          try {
77                              final Document document = dbf.newDocumentBuilder().parse(file);
78                              final Match vnClass = JOOX.$(document.getElementsByTagName("VNCLASS"))
79                                      .add(JOOX.$(document.getElementsByTagName("VNSUBCLASS")));
80  
81                              for (Element thisClass : vnClass) {
82                                  String id = thisClass.getAttribute("ID");
83                                  Matcher mID = VN_PATTERN.matcher(id);
84                                  if (mID.find()) {
85                                      vnMap.put(mID.group(2), mID.group(1));
86                                  } else {
87                                      LOGGER.error("Unable to parse {}", id);
88                                  }
89                              }
90  
91                          } catch (final Exception ex) {
92                              ex.printStackTrace();
93                          }
94                      }
95                  }
96  
97              }
98          }
99  
100         LOGGER.info("Links to: {}", pbLinks.toString());
101         LOGGER.info("Links to: {}", vnLinks.toString());
102         LOGGER.info("Links to: {}", fnLinks.toString());
103         LOGGER.info("Starting dataset: {}", prefix);
104     }
105 
106     @Override public void convert() throws IOException {
107 
108         addMetaToSink();
109 
110         final DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
111 
112         File vnPbMappings = new File(this.path + File.separator + "vn-pb" + File.separator + "vnpbMappings");
113         File vnFnMappings = new File(this.path + File.separator + "vn-fn" + File.separator + "VNC-FNF.s");
114         File vnFnMappingsRole = new File(
115                 this.path + File.separator + "vn-fn" + File.separator + "VN-FNRoleMapping.txt");
116 
117         Document document;
118 
119         try {
120 
121             LOGGER.debug("Processing {} ...", vnPbMappings);
122             document = dbf.newDocumentBuilder().parse(vnPbMappings);
123             final Match predicates = JOOX.$(document.getElementsByTagName("predicate"));
124 
125             for (Element predicate : predicates) {
126                 String lemma = predicate.getAttribute("lemma");
127                 String uriLemma = BankConverter.getLemmaFromPredicateName(lemma);
128 
129                 final Match argmaps = JOOX.$(predicate.getElementsByTagName("argmap"));
130                 for (Element argmap : argmaps) {
131                     String pbRoleset = argmap.getAttribute("pb-roleset");
132                     String vnClass = argmap.getAttribute("vn-class");
133 
134                     String vnID = vnMap.get(vnClass);
135                     if (vnID == null) {
136                         LOGGER.error("VerbNet ID {} not found", vnClass);
137                         vnID = "INVALID"; // FC: will counted as invalid and dropped later 
138 //                        continue;
139                     }
140                     vnID = vnID + "-" + vnClass;
141 
142                     addMapping(pbLinks, vnLinks, uriLemma, pbRoleset, vnID);
143 
144                     final Match roles = JOOX.$(argmap.getElementsByTagName("role"));
145                     for (Element role : roles) {
146                         String pbArg = "arg" + role.getAttribute("pb-arg");
147                         String vnTheta = role.getAttribute("vn-theta");
148 
149                         vnTheta = vnTheta.toLowerCase();
150 
151                         for (String pbLink : pbLinks) {
152                             for (String vnLink : vnLinks) {
153 
154                                 URI pbRolesetURI = uriForRoleset(pbRoleset, pbLink);
155                                 URI pbConceptualizationURI = uriForConceptualizationWithPrefix(lemma, DEFAULT_TYPE, pbRoleset, pbLink);
156                                 URI pbArgURI = uriForArgument(pbRoleset, pbArg, pbLink);
157                                 
158                                 URI vnClassURI = uriForRoleset(vnID, vnLink);
159                                 URI vnConceptualizationURI = uriForConceptualizationWithPrefix(lemma, DEFAULT_TYPE, vnID, vnLink);
160                                 URI vnArgURI = uriForArgument(vnID, vnTheta, vnLink);
161                                 
162                                 addMappings(vnClassURI, pbRolesetURI, vnConceptualizationURI, pbConceptualizationURI, vnArgURI, pbArgURI);
163                                 
164                                 //    URI pbArgConceptualizationURI = uriForConceptualizationWithPrefix(lemma, DEFAULT_TYPE,
165                                 //            pbRoleset, pbArg, pbLink);
166                                 //    URI vnArgConceptualizationURI = uriForConceptualizationWithPrefix(lemma, DEFAULT_TYPE,
167                                 //            vnID, vnTheta, vnLink);
168                                 //
169                                 //    addSingleMapping(prefix, DEFAULT_ARG_SUFFIX, pbArgConceptualizationURI, vnArgConceptualizationURI);
170                             }
171                         }
172                     }
173 
174                 }
175 
176             }
177 
178             LOGGER.debug("Processing {} ...", vnFnMappings);
179             HashMultimap<String, String> vnfnMap = HashMultimap.create();
180             HashMultimap<String, String> vnfnLemmaMap = HashMultimap.create();
181             document = dbf.newDocumentBuilder().parse(vnFnMappings);
182             final Match vnClasses = JOOX.$(document.getElementsByTagName("vncls"));
183 
184             for (Element vnClass : vnClasses) {
185                 String vnCls = vnClass.getAttribute("class");
186                 String lemma = vnClass.getAttribute("vnmember");
187                 String uriLemma = BankConverter.getLemmaFromPredicateName(lemma);
188 
189                 String frame = vnClass.getAttribute("fnframe");
190                 frame = frame.toLowerCase();
191 
192                 vnfnMap.put(vnCls, frame);
193                 
194                 String vnID = vnMap.get(vnCls);
195                 if (vnID == null) {
196                     LOGGER.error("VerbNet ID {} not found", vnCls);
197                     vnID = "INVALID"; // FC: will counted as invalid and dropped later
198 //                    continue;
199                 }
200                 vnID = vnID + "-" + vnCls;
201 
202                 vnfnLemmaMap.put(vnCls + "-" + frame, uriLemma + "|" + vnID);
203                 LOGGER.trace("{} -> {}", vnCls, frame);
204 
205                 Matcher matcher = VN_SC_PATTERN.matcher(vnCls);
206                 while (matcher.find()) {
207                     String newVnCls = matcher.group(1);
208                     vnfnMap.put(newVnCls, frame);
209                     vnfnLemmaMap.put(newVnCls + "-" + frame, uriLemma + "|" + vnID);
210                     LOGGER.trace("{} -> {}", newVnCls, frame);
211                     matcher = VN_SC_PATTERN.matcher(newVnCls);
212                 }
213 
214                 addMapping(fnLinks, vnLinks, uriLemma, frame, vnID);
215             }
216 
217             LOGGER.debug("Processing {} ...", vnFnMappingsRole);
218             int notFound = 0;
219             document = dbf.newDocumentBuilder().parse(vnFnMappingsRole);
220             final Match vnClasses2 = JOOX.$(document.getElementsByTagName("vncls"));
221 
222             for (Element vnClass : vnClasses2) {
223                 String vnCls = vnClass.getAttribute("class");
224                 String frame = vnClass.getAttribute("fnframe");
225 
226                 frame = frame.toLowerCase();
227 
228                 String vnID = vnMap.get(vnCls);
229                 if (vnID == null) {
230                     LOGGER.error("VerbNet ID {} not found", vnCls);
231                     continue;
232                 }
233                 vnID = vnID + "-" + vnCls;
234 
235                 // Check
236                 Set<String> frames = vnfnMap.get(vnCls);
237                 if (!frames.contains(frame)) {
238                     LOGGER.error("Mapping not found: {} -> {}", vnCls, frame);
239                     notFound++;
240                     continue;
241                 }
242 
243                 Set<String> lemmas = vnfnLemmaMap.get(vnCls + "-" + frame);
244                 if (lemmas.size() == 0) {
245                     LOGGER.error("No lemmas for {}", vnCls + "-" + frame);
246                 }
247 
248                 final Match roles = JOOX.$(vnClass.getElementsByTagName("role"));
249                 for (Element role : roles) {
250                     String vnTheta = role.getAttribute("vnrole");
251                     String fnrole = role.getAttribute("fnrole");
252 
253                     vnTheta = vnTheta.toLowerCase();
254                     fnrole = fnrole.toLowerCase();
255 
256                     for (String fnLink : fnLinks) {
257                         for (String vnLink : vnLinks) {
258 
259                             for (String l : lemmas) {
260                                          
261                                 int index = l.indexOf('|');
262                                 String lemma = l.substring(0, index);
263                                 String vnSubClass = l.substring(index + 1);
264                                 
265                                 URI fnFrameURI = uriForRoleset(frame, fnLink);
266                                 URI fnConceptualizationURI = uriForConceptualizationWithPrefix(lemma, DEFAULT_TYPE, frame, fnLink);
267                                 URI fnArgURI = uriForArgument(frame, fnrole, fnLink);
268 
269                                 URI vnClassURI = uriForRoleset(vnSubClass, vnLink);
270                                 URI vnConceptualizationURI = uriForConceptualizationWithPrefix(lemma, DEFAULT_TYPE, vnSubClass, vnLink);
271                                 URI vnArgURI = uriForArgument(vnSubClass, vnTheta, vnLink);
272                                 
273                                 addMappings(vnClassURI, fnFrameURI, vnConceptualizationURI, fnConceptualizationURI, vnArgURI, fnArgURI);
274 
275                                 // todo: Really bad!
276                                 //    String oldArgumentSeparator = argumentSeparator;
277                                 //    argumentSeparator = "@";
278                                 //    URI fnArgConceptualizationURI = uriForConceptualizationWithPrefix(lemma, DEFAULT_TYPE,
279                                 //            frame, fnrole, fnLink);
280                                 //    argumentSeparator = oldArgumentSeparator;
281                                 //
282                                 //    URI vnArgConceptualizationURI = uriForConceptualizationWithPrefix(lemma, DEFAULT_TYPE,
283                                 //            vnID, vnTheta, vnLink);
284                                 //
285                                 //    addSingleMapping(prefix, DEFAULT_ARG_SUFFIX, fnArgConceptualizationURI, vnArgConceptualizationURI);
286                             }
287                         }
288                     }
289                 }
290             }
291 
292             LOGGER.info("Roles not mapped: {}", notFound);
293 
294         } catch (final Exception ex) {
295             throw new IOException(ex);
296         }
297     }
298 
299     private void addMapping(ArrayList<String> links1, ArrayList<String> links2, String uriLemma, String p1, String p2) {
300         for (String link1 : links1) {
301             for (String link2 : links2) {
302                 URI firstRolesetURI = uriForRoleset(p1, link1);
303                 URI secondRolesetURI = uriForRoleset(p2, link2);
304                 URI firstConceptualizationURI = uriForConceptualizationWithPrefix(uriLemma, DEFAULT_TYPE, p1, link1);
305                 URI secondConceptualizationURI = uriForConceptualizationWithPrefix(uriLemma, DEFAULT_TYPE, p2, link2);
306                 addMappings(firstRolesetURI, secondRolesetURI, firstConceptualizationURI, secondConceptualizationURI);
307                 // addSingleMapping(null, prefix, DEFAULT_CON_SUFFIX, firstConceptualizationURI, secondConceptualizationURI);
308             }
309         }
310     }
311 
312     @Override protected URI getPosURI(String textualPOS) {
313         return LEXINFO.VERB;
314     }
315 
316     @Override public String getArgLabel() {
317         return "";
318     }
319 
320 }