1 package eu.fbk.dkm.premon.premonitor;
2
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

import com.google.common.base.Charsets;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import com.google.common.io.Resources;

import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.vocabulary.RDF;
import org.openrdf.model.vocabulary.RDFS;
import org.openrdf.rio.RDFHandler;
import org.openrdf.rio.RDFHandlerException;

import eu.fbk.dkm.premon.vocab.FBMETA;
import eu.fbk.dkm.premon.vocab.LEXINFO;
import eu.fbk.dkm.premon.vocab.PMO;
import eu.fbk.rdfpro.AbstractRDFHandler;
import eu.fbk.rdfpro.RDFSource;
import eu.fbk.rdfpro.RDFSources;
import eu.fbk.rdfpro.util.QuadModel;
33 import eu.fbk.rdfpro.util.QuadModel;
34
35 public class FramebaseConverter extends Converter {
36
/**
 * Namespace of Framebase frame-element properties. It is stripped from property URIs to
 * recover the {@code frame.has_role} style name that follows it.
 */
private static final String FE_NS = "http://framebase.org/fe/";

/** Prefixes parsed from the {@code linkfn} property; FN alignments are emitted once per prefix. */
private final List<String> fnPrefixes;

/** Prefixes parsed from the {@code linkpb} property; used for mapping lines whose bank is {@code pb}. */
private final List<String> pbPrefixes;

/**
 * Prefixes parsed from the {@code linknb} property; used for mapping lines whose bank is
 * {@code nb} (and, in the role mappings, for any bank other than {@code pb}).
 */
private final List<String> nbPrefixes;
44
/**
 * Creates the converter and reads the link prefixes from the configuration.
 *
 * @param path
 *            forwarded to the superclass; {@code readFramebaseTriples()} later lists the files
 *            it contains
 * @param sink
 *            forwarded to the superclass
 * @param properties
 *            configuration; the keys {@code source}, {@code language}, {@code linkfn},
 *            {@code linkpb} and {@code linknb} are read here
 * @param wnInfo
 *            forwarded to the superclass unchanged
 */
public FramebaseConverter(final File path, final RDFHandler sink, final Properties properties,
        final Map<String, URI> wnInfo) {

    super(path, properties.getProperty("source"), sink, properties,
            properties.getProperty("language"), wnInfo);

    // Each link property is turned into a list of prefixes by parseLinks (defined outside
    // this class's visible code)
    this.fnPrefixes = parseLinks(properties.getProperty("linkfn"));
    this.pbPrefixes = parseLinks(properties.getProperty("linkpb"));
    this.nbPrefixes = parseLinks(properties.getProperty("linknb"));
}
55
56 @Override
57 protected URI getPosURI(final String textualPOS) {
58 if (textualPOS == null) {
59 return null;
60 }
61
62 switch (textualPOS.toLowerCase()) {
63 case "adjective":
64 return LEXINFO.ADJECTIVE;
65 case "conjunction":
66 return LEXINFO.CONJUNCTION;
67 case "interjection":
68 return LEXINFO.INTERJECTION;
69 case "preposition":
70 return LEXINFO.PREPOSITION;
71 case "verb":
72 return LEXINFO.VERB;
73 case "determiner":
74 return LEXINFO.DETERMINER;
75 case "noun":
76 return LEXINFO.NOUN;
77 case "subordinate_conjunction":
78 return LEXINFO.SUBORDINATING_CONJUNCTION;
79 case "adverb":
80 return LEXINFO.ADVERB;
81 default:
82 LOGGER.error("POS not found: {}", textualPOS);
83 return null;
84 }
85 }
86
87 private URI getPosURIfromFramebase(final String pos, final String lemma, final String clazz) {
88 if (clazz.equalsIgnoreCase("cardinal_numbers") && !lemma.equalsIgnoreCase("score")
89 && !lemma.equalsIgnoreCase("brace") && !lemma.equalsIgnoreCase("couple")
90 && !lemma.equalsIgnoreCase("fourteen") && !lemma.equalsIgnoreCase("dual")
91 && !lemma.equalsIgnoreCase("pair")) {
92 return LEXINFO.CARDINAL_NUMERAL;
93 }
94 return getPosURI(pos);
95 }
96
97 @Override
98 public void convert() throws IOException, RDFHandlerException {
99
100
101 final QuadModel model = readFramebaseTriples();
102
103
104 emitFNAlignments(model);
105
106
107 emitPBNBAlignments(model);
108
109
110 }
111
112 private QuadModel readFramebaseTriples() throws IOException {
113 final QuadModel model = QuadModel.create();
114 for (final File file : this.path.listFiles()) {
115 try {
116 final AtomicInteger counter = new AtomicInteger();
117 final RDFSource source = RDFSources.read(false, true, null, null,
118 file.getAbsolutePath());
119 source.emit(new AbstractRDFHandler() {
120
121 @Override
122 public void handleStatement(final Statement stmt) throws RDFHandlerException {
123 final URI p = stmt.getPredicate();
124 final Value o = stmt.getObject();
125 if (p.equals(RDFS.SUBCLASSOF) || p.equals(RDFS.DOMAIN)
126 || p.equals(FBMETA.HAS_FRAMENET_FE) || o.equals(FBMETA.MACROFRAME)
127 || o.equals(FBMETA.MINIFRAME) || o.equals(FBMETA.LU_MICROFRAME)) {
128 model.add(stmt);
129 }
130 counter.incrementAndGet();
131 }
132
133 }, 1);
134 LOGGER.info("{} triples read from {}", counter, file);
135 } catch (final RDFHandlerException ex) {
136 throw new IOException(ex);
137 }
138 }
139 return model;
140 }
141
142 private void emitFNAlignments(final QuadModel model) {
143
144
145 int conCount=0;
146 int roleCount=0;
147
148 LOGGER.info("Emitting FN frame -> FB class alignments");
149 for (final Resource s : model.filter(null, RDF.TYPE, FBMETA.LU_MICROFRAME).subjects()) {
150 final URI luMicroframe = (URI) s;
151 final String[] tokens = luMicroframe.getLocalName().split("\\.");
152 assert tokens.length == 3;
153 final String frame = tokens[0].toLowerCase();
154 final String lemma = fixFramebaseLemma(tokens[1]);
155 final String pos = tokens[2].toLowerCase();
156 for (final String fnPrefix : this.fnPrefixes) {
157 final URI fnCon = uriForConceptualization(fnPrefix, lemma,
158 getPosURIfromFramebase(pos, lemma, frame), frame);
159 addStatementToSink(fnCon, PMO.ONTO_MATCH, luMicroframe);
160 addStatementToSink(fnCon, RDF.TYPE, PMO.CONCEPTUALIZATION);
161 conCount++;
162 }
163 }
164 LOGGER.info("Alignments found: "+conCount);
165
166
167 final Set<Resource> macroframes = Sets.newHashSet();
168 macroframes.addAll(model.filter(null, RDF.TYPE, FBMETA.MACROFRAME).subjects());
169 macroframes.removeAll(model.filter(null, RDF.TYPE, FBMETA.MINIFRAME).subjects());
170
171
172 LOGGER.info("Emitting FN frame element -> FB property alignments");
173 for (final Resource f : macroframes) {
174 for (final Resource p : model.filter(null, RDFS.DOMAIN, f).subjects()) {
175 final URI property = (URI) p;
176 final String[] tokens = property.stringValue().substring(FE_NS.length())
177 .toLowerCase().split("\\.");
178 assert tokens.length == 2;
179 final String frame = tokens[0];
180 final String role = tokens[1].replace("has_", "").replace('+', '_');
181 for (final String fnPrefix : this.fnPrefixes) {
182 final URI fnArg = uriForSemanticRole(fnPrefix, frame, role);
183 addStatementToSink(fnArg, PMO.ONTO_MATCH, property);
184 addStatementToSink(fnArg, RDF.TYPE, PMO.SEMANTIC_ROLE);
185 roleCount++;
186 }
187 }
188 }
189 LOGGER.info("Alignments found: "+roleCount);
190 }
191
192 private void emitPBNBAlignments(final QuadModel model) throws IOException {
193
194 int conCount=0;
195 int roleCount=0;
196
197
198
199
200
201
202
203
204 final Multimap<String, URI> luMicroframes = HashMultimap.create();
205 for (final Resource s : model.filter(null, RDF.TYPE, FBMETA.LU_MICROFRAME).subjects()) {
206 final String[] tokens = ((URI) s).getLocalName().toLowerCase().split("\\.");
207 final String frame = tokens[0];
208 final String lemma = fixFramebaseLemma(tokens[1]);
209 luMicroframes.put(frame + "-" + lemma, (URI) s);
210 }
211
212 LOGGER.info("Emitting PB/NB roleset -> FB class alignments");
213 final Map<String, String> rolesetFrames = Maps.newHashMap();
214 for (final String line : Resources.readLines(
215 FramebaseConverter.class.getResource("fn-class-mappings.tsv"), Charsets.UTF_8)) {
216
217 final String[] fields = line.toLowerCase().split("\t");
218 final int index1 = fields[0].indexOf(':');
219 final int index2 = fields[0].lastIndexOf('.');
220 final String bank = fields[0].substring(0, index1);
221 final List<String> prefixes = "pb".equals(bank) ? this.pbPrefixes
222 : "nb".equals(bank) ? this.nbPrefixes : null;
223 final String roleset = fields[0].substring(index1 + 1).replace(".lv", ".LV");
224 final String lemma = fields[0].substring(index1 + 1, index2);
225 final String frame = fields[1];
226 rolesetFrames.put(fields[0], fields[1]);
227
228 URI luMicroframe = null;
229 String pos = null;
230 for (final URI candidate : luMicroframes.get(frame + "-" + lemma)) {
231 final String str = candidate.stringValue();
232 if ("nb".equals(bank) && str.endsWith(".noun")
233 || "pb".equals(bank) && (luMicroframe == null || str.endsWith(".verb"))) {
234 luMicroframe = candidate;
235 pos = str.substring(str.lastIndexOf('.') + 1);
236 }
237 }
238
239 if (luMicroframe == null) {
240 LOGGER.warn("Could not find matching LU Microframe class for " + line);
241 continue;
242 }
243
244 for (final String prefix : prefixes) {
245
246
247 final URI pred = uriForSemanticClass(prefix, roleset);
248 final URI con = uriForConceptualization(prefix, lemma,
249 getPosURIfromFramebase(pos, lemma, frame), roleset);
250 addStatementToSink(pred, PMO.ONTO_MATCH, luMicroframe);
251 addStatementToSink(pred, RDF.TYPE, PMO.SEMANTIC_CLASS);
252 addStatementToSink(con, PMO.ONTO_MATCH, luMicroframe);
253 addStatementToSink(con, RDF.TYPE, PMO.CONCEPTUALIZATION);
254 conCount+=2;
255 }
256 }
257 LOGGER.info("Alignments found: "+conCount);
258
259 final Map<String, URI> properties = Maps.newHashMap();
260 for (final Resource s : model.filter(null, FBMETA.HAS_FRAMENET_FE, null).subjects()) {
261 final String name = s.stringValue().substring(FE_NS.length()).toLowerCase()
262 .replace(".has_", ".").replace('+', '_');
263 properties.put(name, (URI) s);
264 }
265
266 LOGGER.info("Emitting PB/NB role -> FB property alignments");
267 for (final String line : Resources.readLines(
268 FramebaseConverter.class.getResource("fn-role-mappings.tsv"), Charsets.UTF_8)) {
269
270 final String[] fields = line.toLowerCase().split("\t");
271 final int index = fields[0].indexOf(':');
272 final String bank = fields[0].substring(0, index);
273 final String roleset = fields[0].substring(index + 1).replace(".lv", ".LV");
274 final List<String> prefixes = "pb".equals(bank) ? this.pbPrefixes : this.nbPrefixes;
275 final String role = fields[1];
276 final String frame = rolesetFrames.get(fields[0]);
277 final String fe = fields[2];
278
279 if (frame == null) {
280 LOGGER.error("Could not find FN frame for " + line);
281 continue;
282 }
283
284 final URI property = properties.get(frame + "." + fe);
285 if (property == null) {
286 LOGGER.warn("Could not find matching property for " + line);
287 continue;
288 }
289
290 for (final String prefix : prefixes) {
291 final URI arg = uriForSemanticRole(prefix, roleset, role);
292 addStatementToSink(arg, PMO.ONTO_MATCH, property);
293 addStatementToSink(arg, RDF.TYPE, PMO.SEMANTIC_ROLE);
294 roleCount++;
295 }
296 }
297 LOGGER.info("Alignments found: "+roleCount);
298 }
299
300 private static URI uriForSemanticClass(final String prefix, final String clazz) {
301 final StringBuilder builder = new StringBuilder();
302 builder.append(NAMESPACE);
303 builder.append(prefix);
304 builder.append("-");
305 builder.append(clazz);
306 return createURI(builder.toString());
307 }
308
309 private static URI uriForConceptualization(final String prefix, final String lemma,
310 final URI pos, final String clazz) {
311 final StringBuilder builder = new StringBuilder();
312 builder.append(NAMESPACE);
313 builder.append(CONCEPTUALIZATION_PREFIX);
314 builder.append("-");
315 builder.append(LEXINFO.map.get(pos));
316 builder.append("-");
317 builder.append(lemma.equals("%") ? "perc-sign" : lemma.replaceAll("[^a-zA-Z0-9-_+]", ""));
318 builder.append("-");
319 builder.append(prefix);
320 builder.append("-");
321 builder.append(clazz);
322 return createURI(builder.toString());
323 }
324
325 private URI uriForSemanticRole(final String prefix, final String clazz,
326 final String role) {
327 final StringBuilder builder = new StringBuilder();
328 builder.append(NAMESPACE);
329 builder.append(prefix.toLowerCase());
330 builder.append("-");
331 builder.append(clazz);
332 if (prefix.startsWith("fn")) {
333 builder.append(argumentSeparator).append(role.toLowerCase());
334 } else if (prefix.startsWith("pb") || prefix.startsWith("nb")) {
335 builder.append(argumentSeparator).append("arg").append(role.toLowerCase());
336 } else {
337 throw new UnsupportedOperationException();
338 }
339 return createURI(builder.toString());
340 }
341
342 private static String fixFramebaseLemma(final String lemma) {
343
344 if (lemma.equals("nom+de+plume")) {
345 return "nomdeplume";
346 } else if (lemma.equals("nom+de+guerre")) {
347 return "nomdeguerre";
348 } else {
349 return lemma;
350 }
351 }
352
353 }