1   package eu.fbk.dkm.premon.premonitor;
2   
3   import com.google.common.io.Files;
4   import eu.fbk.dkm.premon.vocab.LEXINFO;
5   import eu.fbk.dkm.premon.vocab.PMO;
6   import org.apache.commons.csv.CSVFormat;
7   import org.apache.commons.csv.CSVParser;
8   import org.apache.commons.csv.CSVRecord;
9   import org.joox.JOOX;
10  import org.joox.Match;
11  import org.openrdf.model.URI;
12  import org.openrdf.model.vocabulary.RDF;
13  import org.openrdf.model.vocabulary.RDFS;
14  import org.openrdf.rio.RDFHandler;
15  import org.w3c.dom.Document;
16  import org.w3c.dom.Element;
17  
18  import javax.xml.parsers.DocumentBuilderFactory;
19  import java.io.File;
20  import java.io.IOException;
21  import java.nio.charset.Charset;
22  import java.util.*;
23  import java.util.regex.Matcher;
24  import java.util.regex.Pattern;
25  
26  public class PredMatConverter extends Converter {
27  
28  	final private int ID_LANG = 1 -1;																					// --> numeric identifier for better readability ID_LANG = 1_ID_LANG
29  	final private int VN_SUBCLASS = 7 -1;
30  	final private int VN_CLASS = 5 -1;
31  	final private int VN_LEMA = 9 -1;
32  	final private int VN_ROLE = 10 -1;
33  	final private int FN_FRAME = 13 -1;
34  	final private int FN_LE = 14 -1;
35  	final private int FN_FRAME_ELEMENT = 15 -1;
36  	final private int PB_ROLESET = 16 -1;
37  	final private int PB_ARG = 17 -1;
38  	final private int WN_SENSE = 11 -1;																					// numeric identifier -->
39  
40  	final private String FILE_NAME = "PredicateMatrix.v1.3.txt"; 														// File name of DB
41  
42  	private static final Pattern VN_PATTERN = Pattern.compile("([^-]+)-(.*)");
43  	private static final Pattern WN_PATTERN = Pattern.compile("#([^#]+)$");
44  
45  	private static final String DEFAULT_TYPE = "v";
46  
47  	private Map<String, String> vnMap = new HashMap<>();
48  
49  	private ArrayList<String> vnLinks = new ArrayList<>();
50  	private ArrayList<String> fnLinks = new ArrayList<>();
51  	private ArrayList<String> pbLinks = new ArrayList<>();
52  
53  	protected Set entries = new HashSet();
54  
55  	public PredMatConverter(File path, RDFHandler sink, Properties properties, Map<String, URI> wnInfo){
56  		super(path, properties.getProperty("source"), sink, properties, properties.getProperty("language"), wnInfo);
57  
58  
59  		addLinks(pbLinks, properties.getProperty("linkpb"));															// --> set links to DBs
60  		addLinks(fnLinks, properties.getProperty("linkfn"));
61  		addLinks(vnLinks, properties.getProperty("linkvn"));															// set links to DBs -->
62  
63  		String vnPath = properties.getProperty("vnpath");																// --> Import HashMap for VerbNet ID
64  		if (vnPath != null) {
65  			LOGGER.info("Loading VerbNet");
66  			File vnFile = new File(vnPath);
67  			if (vnFile.exists() && vnFile.isDirectory()) {
68  				final DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
69  
70  				for (final File file : Files.fileTreeTraverser().preOrderTraversal(vnFile)) {
71  					if (!file.isDirectory() && file.getName().endsWith(".xml")) {
72  						LOGGER.debug("Processing {} ...", file);
73  
74  						try {
75  							final Document document = dbf.newDocumentBuilder().parse(file);
76  							final Match vnClass = JOOX.$(document.getElementsByTagName("VNCLASS"))
77  									.add(JOOX.$(document.getElementsByTagName("VNSUBCLASS")));
78  
79  							for (Element thisClass : vnClass) {
80  								String id = thisClass.getAttribute("ID");
81  								Matcher mID = VN_PATTERN.matcher(id);
82  								if (mID.find()) {
83  									vnMap.put(mID.group(2), mID.group(1));
84  								} else {
85  									LOGGER.error("Unable to parse {}", id);
86  								}
87  							}
88  
89  						} catch (final Exception ex) {
90  							ex.printStackTrace();
91  						}
92  					}
93  				}
94  
95  			}
96  		}																												// Import HashMap for VerbNet ID -->
97  
98  		LOGGER.info("Links to: {}", pbLinks.toString());
99  		LOGGER.info("Links to: {}", vnLinks.toString());
100 		LOGGER.info("Links to: {}", fnLinks.toString());
101 		LOGGER.info("Starting dataset: {}", prefix);
102 	}
103 
104 	@Override public void convert() throws IOException {
105 
106 		addMetaToSink();																								// Standard data to sink
107 
108 		File PredMat = new File(this.path + File.separator + FILE_NAME);												// Open Predicate Matrix File
109 
110 		try{
111 
112 			List<URI> classes = new ArrayList<URI>();
113 			List<URI> conceptualizations = new ArrayList<URI>();
114 			List<URI> arguments = new ArrayList<URI>();
115 
116 			CSVParser parser = CSVParser.parse(PredMat, Charset.defaultCharset(), CSVFormat.TDF.withSkipHeaderRecord());// Parse the csv File with tabs instead of commas and exclude header
117 
118 			for(CSVRecord pme : parser) {
119 
120 				String lang = pme.get(ID_LANG); 																		// --> Filter non english languages
121 				if(lang.compareToIgnoreCase("id:eng")!=0){
122 					continue;
123 				}																										// Filter non english languages -->
124 
125 				String vnSc = pme.get(VN_SUBCLASS).compareToIgnoreCase("vn:null") == 0?									// --> get needed data
126 						pme.get(VN_CLASS) : pme.get(VN_SUBCLASS);														// If sublcass = null take class instead
127 				String vnLe = pme.get(VN_LEMA);
128 				String vnSr = pme.get(VN_ROLE);
129 				vnSr = vnSr.toLowerCase();
130 
131 				String fnSc = pme.get(FN_FRAME);
132 				fnSc = fnSc.toLowerCase();
133 				String fnLe = pme.get(FN_LE);
134 				fnLe = fnLe.compareToIgnoreCase("fn:null") == 0?fnLe:fnLe.substring(0, fnLe.length()-2);				// fn:****.v => fn:**** (remove the ".v") if is != null
135 				String fnSr = pme.get(FN_FRAME_ELEMENT);
136 				fnSr = fnSr.toLowerCase();
137 
138 				String pbSc = pme.get(PB_ROLESET);
139 				String pbSr = pme.get(PB_ARG);
140 				pbSr = pbSr.toLowerCase();
141 
142 				String wnSense = pme.get(WN_SENSE);																		// get needed data -->
143 
144 				vnSc = removeNameSpace(vnSc); vnLe = removeNameSpace(vnLe); vnSr = removeNameSpace(vnSr);				// --> Removing Namespaces vn:abate => abate or if vn:null => java null
145 				fnSc = removeNameSpace(fnSc); fnLe = removeNameSpace(fnLe); fnSr = removeNameSpace(fnSr);
146 				pbSc = removeNameSpace(pbSc); pbSr = removeNameSpace(pbSr);
147 				wnSense = removeNameSpace(wnSense);																		// Removing Namespaces -->
148 
149 				pbSr = pbSr == null? null : "arg" + pbSr;																// Add "arg" to pbSr 1 => arg1
150 
151 				String hash = vnSc + vnLe + vnSr + fnSc + fnLe + fnSr + pbSc + pbSr + wnSense; 							// --> Duplicate Check
152 				if(!entries.add(hash.hashCode())){
153 					notadded++;
154 					total++;
155 					continue;
156 				}
157 				added++;
158 				total++;																								// Duplicate Check -->
159 
160 				for (String vnLink : vnLinks) {																			// --> Adding data to "sink"
161 					for (String fnLink : fnLinks) {
162 						for (String pbLink : pbLinks) {
163 							String vnID = vnSc == null? null : getVnID(vnSc);
164 							URI vnClassURI = vnID == null? null : uriForRoleset(vnID, vnLink);
165 							URI vnConceptualizationURI = vnLe == null || vnID == null?
166 									null : uriForConceptualizationWithPrefix(vnLe, DEFAULT_TYPE, vnID, vnLink);
167 							URI vnArgURI = vnSr == null || vnID == null? null : uriForArgument(vnID, vnSr, vnLink);
168 
169 							URI fnFrameURI = fnSc == null? null : uriForRoleset(fnSc, fnLink);
170 							URI fnConceptualizationURI = fnLe == null || fnSc == null?
171 									null : uriForConceptualizationWithPrefix(fnLe, DEFAULT_TYPE, fnSc, fnLink);
172 							URI fnArgURI = fnSr == null || fnSc == null? null : uriForArgument(fnSc, fnSr, fnLink);
173 
174 							URI pbRolesetURI = pbSc == null? null : uriForRoleset(pbSc, pbLink);
175 							URI pbConceptualizationURI = pbSc == null?
176 									null : uriForConceptualizationWithPrefix(pbSc.substring(0, pbSc.indexOf(".")),
177 													DEFAULT_TYPE, pbSc, pbLink);
178 							URI pbArgURI = pbSr == null || pbSc == null? null : uriForArgument(pbSc, pbSr, pbLink);
179 
180 							URI wnSenseURI = wnSense == null?
181 									null : uriForWnSense(wnSense, wnSense.substring(0,wnSense.indexOf("%")));
182 
183 
184 							if(vnClassURI != null){
185 								classes.add(vnClassURI);
186 							}if(fnFrameURI != null){
187 								classes.add(fnFrameURI);
188 							}if(pbRolesetURI != null){
189 								classes.add(pbRolesetURI);
190 							}
191 
192 							if(vnConceptualizationURI != null){
193 								conceptualizations.add(vnConceptualizationURI);
194 							}if(fnConceptualizationURI != null){
195 								conceptualizations.add(fnConceptualizationURI);
196 							}if(pbConceptualizationURI != null){
197 								conceptualizations.add(pbConceptualizationURI);
198 							}if(wnSenseURI != null) {
199 								conceptualizations.add(wnSenseURI);
200 							}
201 
202 							if(vnArgURI != null){
203 								arguments.add(vnArgURI);
204 							}if(fnArgURI != null){
205 								arguments.add(fnArgURI);
206 							}if(pbArgURI != null){
207 								arguments.add(pbArgURI);
208 							}
209 
210 							addMappings(classes, conceptualizations, arguments);
211 
212 							classes.clear();
213 							conceptualizations.clear();
214 							arguments.clear();
215 						}
216 					}
217 				}																										// Adding data to "sink" -->
218 
219 			}
220 
221 			LOGGER.info("Element added: " + added + ", not added: " + notadded + " of " + total);
222 			LOGGER.info("Class mappings: {}, Conceptualization mappings: {}, Role mappings: {}", nclass, ncon, nrole);
223 		}catch (IOException e){
224 			throw e;
225 		}
226 	}
227 
228 	@Override protected URI getPosURI(String textualPOS) {
229         return LEXINFO.VERB;
230     }
231 
232 	@Override public String getArgLabel() {
233         return "";
234     }
235 
236 	private String removeNameSpace(String str){																			// Remove the namespace vn:mind => mind
237 		String strNoNS = str.substring(3, str.length());
238 		if(strNoNS.compareToIgnoreCase("NULL") == 0){
239 			return null;
240 		}else{
241 			return strNoNS;
242 		}
243 	}
244 
245 	private String getVnID(String vnSc){																				// 37.11-1 => lecture-37.11-1
246 		String vnID = vnMap.get(vnSc);
247 
248 		if (vnID == null) {
249 			LOGGER.error("VerbNet ID {} not found", vnSc);
250 			vnID = null;
251 		}else{
252 			vnID = vnID + "-" + vnSc;
253 		}
254 
255 		return vnID;
256 	}
257 
258 	private URI uriForWnSense(String wnSense, String uriLemma){															// --> Get the URI for WordNet
259 
260 		URI wnConceptualizationURI = null;
261 
262 		if (wnSense != null && this.wnInfo.size() > 0) {
263 			final String[] wns = wnSense.split("\\s+");
264 
265 			for (String wn : wns) {
266 
267 				wn = wn.trim();
268 
269 				if (wn.length() == 0) {
270 					continue;
271 				}
272 
273 				/*boolean questionMark = false;
274 				if (wn.startsWith("?")) {
275 					//LOGGER.warn("The wn {} starts with ?", wn);
276 					questionMark = true;
277 					wn = wn.substring(1);
278 				}*/
279 
280 				final URI wnURI = this.wnInfo.get(wn);
281 
282 				if (wnURI == null) {
283 					LOGGER.warn("No wnURI found for {}", wn);
284 					continue;
285 				}
286 
287 				String lemma = wn.substring(0, wn.indexOf('%'));
288 				final URI reference = this.wnInfo.get(wnURI.toString() + "|" + lemma);
289 
290 				if (reference == null) {
291 					LOGGER.warn("No reference found for {} / {}", wnURI.toString(), lemma);
292 					continue;
293 				}
294 
295 				final Matcher m = WN_PATTERN.matcher(reference.toString());
296 				if (!m.find()) {
297 					continue;
298 				}
299 
300 				String wnuri = wnURI.toString();
301 				URI type = null;
302 				if(wnuri.endsWith("-v")){
303 					type = LEXINFO.VERB;
304 				}else if(wnuri.endsWith("-n")){
305 					type = LEXINFO.NOUN;
306 				}else if(wnuri.endsWith("-r")){
307 					type = LEXINFO.ADVERB;
308 				}else if(wnuri.endsWith("-a")){
309 					type = LEXINFO.ADJECTIVE;
310 				}
311 
312 				URI lexicalEntryURI = uriForLexicalEntry(lemma, type);
313 
314 				wnConceptualizationURI = uriForConceptualizationWithPrefix(uriLemma,
315 						"v", wnURI.toString().replace(WN_NAMESPACE,""), "wn31");
316 
317 				addStatementToSink(wnConceptualizationURI, RDF.TYPE, PMO.CONCEPTUALIZATION);
318 				addStatementToSink(wnConceptualizationURI, PMO.EVOKING_ENTRY, lexicalEntryURI);
319 				addStatementToSink(wnConceptualizationURI, RDFS.SEEALSO, reference);
320 				addStatementToSink(wnConceptualizationURI, PMO.EVOKED_CONCEPT, wnURI);
321 			}
322 		}
323 		return wnConceptualizationURI;
324 	}																													// Get the URI for WordNet -->
325 
326 }