1   package eu.fbk.dkm.premon.premonitor;
2   
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;

import com.google.common.base.Charsets;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import com.google.common.io.Resources;

import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.vocabulary.RDF;
import org.openrdf.model.vocabulary.RDFS;
import org.openrdf.rio.RDFHandler;
import org.openrdf.rio.RDFHandlerException;

import eu.fbk.dkm.premon.vocab.FBMETA;
import eu.fbk.dkm.premon.vocab.LEXINFO;
import eu.fbk.dkm.premon.vocab.PMO;
import eu.fbk.rdfpro.AbstractRDFHandler;
import eu.fbk.rdfpro.RDFSource;
import eu.fbk.rdfpro.RDFSources;
import eu.fbk.rdfpro.util.QuadModel;
34  
35  public class FramebaseConverter extends Converter {
36  
37      private static final String FE_NS = "http://framebase.org/fe/";
38  
39      private final List<String> fnPrefixes;
40  
41      private final List<String> pbPrefixes;
42  
43      private final List<String> nbPrefixes;
44  
45      public FramebaseConverter(final File path, final RDFHandler sink, final Properties properties,
46              final Map<String, URI> wnInfo) {
47  
48          super(path, properties.getProperty("source"), sink, properties,
49                  properties.getProperty("language"), wnInfo);
50  
51          this.fnPrefixes = parseLinks(properties.getProperty("linkfn"));
52          this.pbPrefixes = parseLinks(properties.getProperty("linkpb"));
53          this.nbPrefixes = parseLinks(properties.getProperty("linknb"));
54      }
55  
56      @Override
57      protected URI getPosURI(final String textualPOS) {
58          if (textualPOS == null) {
59              return null;
60          }
61          // Missing: pronoun (PRON=lexinfo:pronoun), cardinal number (NUM=lexinfo:CardinalNumber)
62          switch (textualPOS.toLowerCase()) {
63          case "adjective":
64              return LEXINFO.ADJECTIVE;
65          case "conjunction":
66              return LEXINFO.CONJUNCTION;
67          case "interjection":
68              return LEXINFO.INTERJECTION;
69          case "preposition":
70              return LEXINFO.PREPOSITION;
71          case "verb":
72              return LEXINFO.VERB;
73          case "determiner":
74              return LEXINFO.DETERMINER;
75          case "noun":
76              return LEXINFO.NOUN;
77          case "subordinate_conjunction":
78              return LEXINFO.SUBORDINATING_CONJUNCTION;
79          case "adverb":
80              return LEXINFO.ADVERB;
81          default:
82              LOGGER.error("POS not found: {}", textualPOS);
83              return null;
84          }
85      }
86  
87      private URI getPosURIfromFramebase(final String pos, final String lemma, final String clazz) {
88          if (clazz.equalsIgnoreCase("cardinal_numbers") && !lemma.equalsIgnoreCase("score")
89                  && !lemma.equalsIgnoreCase("brace") && !lemma.equalsIgnoreCase("couple")
90                  && !lemma.equalsIgnoreCase("fourteen") && !lemma.equalsIgnoreCase("dual")
91                  && !lemma.equalsIgnoreCase("pair")) {
92              return LEXINFO.CARDINAL_NUMERAL;
93          }
94          return getPosURI(pos);
95      }
96  
97      @Override
98      public void convert() throws IOException, RDFHandlerException {
99  
100         // Load selected FrameBase (FB) triples
101         final QuadModel model = readFramebaseTriples();
102 
103         // Emit FN -> FrameBase alignments, deriving them exclusively from FB data
104         emitFNAlignments(model);
105 
106         // Emit PB/NB -> FrameBase alignments, deriving them from FB and embedded mapping data
107         emitPBNBAlignments(model);
108 
109         //LOGGER.info("Framebase Total alignments: "+model.filter(null,PMO.ONTO_MATCH,null).size());
110     }
111 
112     private QuadModel readFramebaseTriples() throws IOException {
113         final QuadModel model = QuadModel.create();
114         for (final File file : this.path.listFiles()) {
115             try {
116                 final AtomicInteger counter = new AtomicInteger();
117                 final RDFSource source = RDFSources.read(false, true, null, null,
118                         file.getAbsolutePath());
119                 source.emit(new AbstractRDFHandler() {
120 
121                     @Override
122                     public void handleStatement(final Statement stmt) throws RDFHandlerException {
123                         final URI p = stmt.getPredicate();
124                         final Value o = stmt.getObject();
125                         if (p.equals(RDFS.SUBCLASSOF) || p.equals(RDFS.DOMAIN)
126                                 || p.equals(FBMETA.HAS_FRAMENET_FE) || o.equals(FBMETA.MACROFRAME)
127                                 || o.equals(FBMETA.MINIFRAME) || o.equals(FBMETA.LU_MICROFRAME)) {
128                             model.add(stmt);
129                         }
130                         counter.incrementAndGet();
131                     }
132 
133                 }, 1);
134                 LOGGER.info("{} triples read from {}", counter, file);
135             } catch (final RDFHandlerException ex) {
136                 throw new IOException(ex);
137             }
138         }
139         return model;
140     }
141 
142     private void emitFNAlignments(final QuadModel model) {
143 
144 
145         int conCount=0;
146         int roleCount=0;
147         // Emit mappings for LU microframes
148         LOGGER.info("Emitting FN frame -> FB class alignments");
149         for (final Resource s : model.filter(null, RDF.TYPE, FBMETA.LU_MICROFRAME).subjects()) {
150             final URI luMicroframe = (URI) s;
151             final String[] tokens = luMicroframe.getLocalName().split("\\.");
152             assert tokens.length == 3;
153             final String frame = tokens[0].toLowerCase();
154             final String lemma = fixFramebaseLemma(tokens[1]);
155             final String pos = tokens[2].toLowerCase();
156             for (final String fnPrefix : this.fnPrefixes) {
157                 final URI fnCon = uriForConceptualization(fnPrefix, lemma,
158                         getPosURIfromFramebase(pos, lemma, frame), frame);
159                 addStatementToSink(fnCon, PMO.ONTO_MATCH, luMicroframe);
160                 addStatementToSink(fnCon, RDF.TYPE, PMO.CONCEPTUALIZATION);
161                 conCount++;
162             }
163         }
164         LOGGER.info("Alignments found: "+conCount);
165 
166         // Retrieve the most specific macroframes
167         final Set<Resource> macroframes = Sets.newHashSet();
168         macroframes.addAll(model.filter(null, RDF.TYPE, FBMETA.MACROFRAME).subjects());
169         macroframes.removeAll(model.filter(null, RDF.TYPE, FBMETA.MINIFRAME).subjects());
170 
171         // Emit mappings for arguments associated to macroframes
172         LOGGER.info("Emitting FN frame element -> FB property alignments");
173         for (final Resource f : macroframes) {
174             for (final Resource p : model.filter(null, RDFS.DOMAIN, f).subjects()) {
175                 final URI property = (URI) p;
176                 final String[] tokens = property.stringValue().substring(FE_NS.length())
177                         .toLowerCase().split("\\.");
178                 assert tokens.length == 2;
179                 final String frame = tokens[0];
180                 final String role = tokens[1].replace("has_", "").replace('+', '_');
181                 for (final String fnPrefix : this.fnPrefixes) {
182                     final URI fnArg = uriForSemanticRole(fnPrefix, frame, role);
183                     addStatementToSink(fnArg, PMO.ONTO_MATCH, property);
184                     addStatementToSink(fnArg, RDF.TYPE, PMO.SEMANTIC_ROLE);
185                     roleCount++;
186                 }
187             }
188         }
189         LOGGER.info("Alignments found: "+roleCount);
190     }
191 
192     private void emitPBNBAlignments(final QuadModel model) throws IOException {
193 
194         int conCount=0;
195         int roleCount=0;
196 
197 //        final Map<String, String> multiwordLemmas = Maps.newHashMap();
198 //        for (final String line : Resources.readLines(
199 //                FramebaseConverter.class.getResource("pb215-multiwords.tsv"), Charsets.UTF_8)) {
200 //            final String[] fields = line.split("\t");
201 //            multiwordLemmas.put(fields[0], fields[1]);
202 //        }
203 
204         final Multimap<String, URI> luMicroframes = HashMultimap.create();
205         for (final Resource s : model.filter(null, RDF.TYPE, FBMETA.LU_MICROFRAME).subjects()) {
206             final String[] tokens = ((URI) s).getLocalName().toLowerCase().split("\\.");
207             final String frame = tokens[0];
208             final String lemma = fixFramebaseLemma(tokens[1]);
209             luMicroframes.put(frame + "-" + lemma, (URI) s);
210         }
211 
212         LOGGER.info("Emitting PB/NB roleset -> FB class alignments");
213         final Map<String, String> rolesetFrames = Maps.newHashMap();
214         for (final String line : Resources.readLines(
215                 FramebaseConverter.class.getResource("fn-class-mappings.tsv"), Charsets.UTF_8)) {
216 
217             final String[] fields = line.toLowerCase().split("\t");
218             final int index1 = fields[0].indexOf(':');
219             final int index2 = fields[0].lastIndexOf('.');
220             final String bank = fields[0].substring(0, index1);
221             final List<String> prefixes = "pb".equals(bank) ? this.pbPrefixes
222                     : "nb".equals(bank) ? this.nbPrefixes : null;
223             final String roleset = fields[0].substring(index1 + 1).replace(".lv", ".LV");
224             final String lemma = fields[0].substring(index1 + 1, index2);
225             final String frame = fields[1];
226             rolesetFrames.put(fields[0], fields[1]);
227 
228             URI luMicroframe = null;
229             String pos = null;
230             for (final URI candidate : luMicroframes.get(frame + "-" + lemma)) {
231                 final String str = candidate.stringValue();
232                 if ("nb".equals(bank) && str.endsWith(".noun")
233                         || "pb".equals(bank) && (luMicroframe == null || str.endsWith(".verb"))) {
234                     luMicroframe = candidate;
235                     pos = str.substring(str.lastIndexOf('.') + 1);
236                 }
237             }
238 
239             if (luMicroframe == null) {
240                 LOGGER.warn("Could not find matching LU Microframe class for " + line);
241                 continue;
242             }
243 
244             for (final String prefix : prefixes) {
245                 //final String expandedLemma = prefix.startsWith("pb")
246                 //        ? multiwordLemmas.getOrDefault(roleset, lemma) : lemma;
247                 final URI pred = uriForSemanticClass(prefix, roleset);
248                 final URI con = uriForConceptualization(prefix, lemma,
249                         getPosURIfromFramebase(pos, lemma, frame), roleset);
250                 addStatementToSink(pred, PMO.ONTO_MATCH, luMicroframe);
251                 addStatementToSink(pred, RDF.TYPE, PMO.SEMANTIC_CLASS);
252                 addStatementToSink(con, PMO.ONTO_MATCH, luMicroframe);
253                 addStatementToSink(con, RDF.TYPE, PMO.CONCEPTUALIZATION);
254                 conCount+=2;
255             }
256         }
257         LOGGER.info("Alignments found: "+conCount);
258 
259         final Map<String, URI> properties = Maps.newHashMap();
260         for (final Resource s : model.filter(null, FBMETA.HAS_FRAMENET_FE, null).subjects()) {
261             final String name = s.stringValue().substring(FE_NS.length()).toLowerCase()
262                     .replace(".has_", ".").replace('+', '_');
263             properties.put(name, (URI) s);
264         }
265 
266         LOGGER.info("Emitting PB/NB role -> FB property alignments");
267         for (final String line : Resources.readLines(
268                 FramebaseConverter.class.getResource("fn-role-mappings.tsv"), Charsets.UTF_8)) {
269 
270             final String[] fields = line.toLowerCase().split("\t");
271             final int index = fields[0].indexOf(':');
272             final String bank = fields[0].substring(0, index);
273             final String roleset = fields[0].substring(index + 1).replace(".lv", ".LV");
274             final List<String> prefixes = "pb".equals(bank) ? this.pbPrefixes : this.nbPrefixes;
275             final String role = fields[1];
276             final String frame = rolesetFrames.get(fields[0]);
277             final String fe = fields[2];
278 
279             if (frame == null) {
280                 LOGGER.error("Could not find FN frame for " + line);
281                 continue;
282             }
283 
284             final URI property = properties.get(frame + "." + fe);
285             if (property == null) {
286                 LOGGER.warn("Could not find matching property for " + line);
287                 continue;
288             }
289 
290             for (final String prefix : prefixes) {
291                 final URI arg = uriForSemanticRole(prefix, roleset, role);
292                 addStatementToSink(arg, PMO.ONTO_MATCH, property);
293                 addStatementToSink(arg, RDF.TYPE, PMO.SEMANTIC_ROLE);
294                 roleCount++;
295             }
296         }
297         LOGGER.info("Alignments found: "+roleCount);
298     }
299 
300     private static URI uriForSemanticClass(final String prefix, final String clazz) {
301         final StringBuilder builder = new StringBuilder();
302         builder.append(NAMESPACE);
303         builder.append(prefix);
304         builder.append("-");
305         builder.append(clazz);
306         return createURI(builder.toString());
307     }
308 
309     private static URI uriForConceptualization(final String prefix, final String lemma,
310             final URI pos, final String clazz) {
311         final StringBuilder builder = new StringBuilder();
312         builder.append(NAMESPACE);
313         builder.append(CONCEPTUALIZATION_PREFIX);
314         builder.append("-");
315         builder.append(LEXINFO.map.get(pos));
316         builder.append("-");
317         builder.append(lemma.equals("%") ? "perc-sign" : lemma.replaceAll("[^a-zA-Z0-9-_+]", ""));
318         builder.append("-");
319         builder.append(prefix);
320         builder.append("-");
321         builder.append(clazz);
322         return createURI(builder.toString());
323     }
324 
325     private URI uriForSemanticRole(final String prefix, final String clazz,
326             final String role) {
327         final StringBuilder builder = new StringBuilder();
328         builder.append(NAMESPACE);
329         builder.append(prefix.toLowerCase());
330         builder.append("-");
331         builder.append(clazz);
332         if (prefix.startsWith("fn")) {
333             builder.append(argumentSeparator).append(role.toLowerCase());
334         } else if (prefix.startsWith("pb") || prefix.startsWith("nb")) {
335             builder.append(argumentSeparator).append("arg").append(role.toLowerCase());
336         } else {
337             throw new UnsupportedOperationException();
338         }
339         return createURI(builder.toString());
340     }
341 
342     private static String fixFramebaseLemma(final String lemma) {
343         // TODO this is a hack
344         if (lemma.equals("nom+de+plume")) {
345             return "nomdeplume";
346         } else if (lemma.equals("nom+de+guerre")) {
347             return "nomdeguerre";
348         } else {
349             return lemma;
350         }
351     }
352 
353 }