Skip to content
Snippets Groups Projects
Commit 3f201806 authored by Martijn Landman
Browse files

Added function to make a Levenshtein distance matrix

parent 6c8a27b7
No related tags found
No related merge requests found
package nl.fairbydesign.backend.ncbi;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.*;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.junit.Test;
......@@ -17,11 +15,10 @@ import java.util.*;
public class NCBI_Matrices {
static String startTime = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd_HH-mm-ss"));
static String targetDir = "./json/all";
@Test
public void attributeAbundanceMatrix() throws IOException {
File root = new File(targetDir+"/fetch");
File root = new File("./json/all/fetch");
// Map<projectID, Set[attributes]>
Map<String, Set<String>> attributeMap = new HashMap<>();
Set<String> attributes = new HashSet<>();
......@@ -56,11 +53,11 @@ public class NCBI_Matrices {
}
}
System.out.println("processed "+counter+" files in total");
writeAttributes(attributes);
AAMWriteAttributes(attributes);
AAMtoCSV(attributes, attributeMap);
}
public Set<String> getAllNodeKeys(String json) throws IOException {
private Set<String> getAllNodeKeys(String json) throws IOException {
ObjectMapper objectMapper = new ObjectMapper();
objectMapper.configure(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY, true);
objectMapper.configure(MapperFeature.ACCEPT_CASE_INSENSITIVE_PROPERTIES, true);
......@@ -97,7 +94,7 @@ public class NCBI_Matrices {
* @param attributeMap A HashMap of HashSets with projectID as keys
*/
private void AAMtoCSV(Set<String> sampleAttributes, Map<String, Set<String>> attributeMap) {
String folder = targetDir+"/parsings/";
String folder = "./json/all/parsings/";
boolean isFolderCreated = new File(folder).exists() || new File(folder).mkdirs();
File file = new File(folder+"AAM"+startTime+".csv");
......@@ -137,7 +134,7 @@ public class NCBI_Matrices {
/**
* @param sampleAttributes A HashSet of possible attributes
*/
private void writeAttributes (Set<String> sampleAttributes) {
private void AAMWriteAttributes(Set<String> sampleAttributes) {
String folder = "./nlp/Input/";
boolean isFolderCreated = new File(folder).exists() || new File(folder).mkdirs();
File file = new File(folder+"AllAttributes_Raw.txt");
......@@ -157,4 +154,17 @@ public class NCBI_Matrices {
}
}
/**
 * Prints the second comma-separated column of every data row in
 * {@code ./nlp/Output/AllAttributes_Cleaned.txt} to standard output.
 *
 * <p>The first line of the file is skipped (presumably a header row; confirm against
 * whatever produces this file). Rows that do not have a second column are reported on
 * standard output and skipped instead of aborting the run. A missing input file is
 * reported on standard output and otherwise ignored.
 */
@Test
public void similarityMatrix() {
    File inputFile = new File("./nlp/Output/AllAttributes_Cleaned.txt");
    try (Scanner scanner = new Scanner(inputFile)) {
        // An empty file has no first line to skip; an unguarded nextLine() would throw
        // NoSuchElementException.
        if (scanner.hasNextLine()) {
            scanner.nextLine();
        }
        while (scanner.hasNextLine()) {
            String line = scanner.nextLine();
            // A limit of -1 keeps trailing empty columns, so "raw," yields an empty
            // second column instead of a one-element array.
            String[] columns = line.split(",", -1);
            if (columns.length < 2) {
                System.out.println("Skipping line without a second column: " + line);
                continue;
            }
            System.out.println(columns[1]);
        }
    } catch (FileNotFoundException e) {
        System.out.println("Could not find file: " + inputFile);
    }
}
}
......@@ -9,6 +9,8 @@ import org.junit.Test;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
......@@ -19,7 +21,7 @@ public class NCBI_NLP {
public void cleanInput() {
File file = new File("./nlp/Input/AllAttributes_Raw.txt");
ArrayList<String> list = new ArrayList<>();
Pattern pattern = Pattern.compile("[^a-zA-Z\\d\\s]|\\s\\S\\s|\\s\\S$|^\\S\\s|\\s+");
Pattern pattern = Pattern.compile("[^a-zA-Z\\d\\s]|\\s+");
try (Scanner scanner = new Scanner(file)) {
while (scanner.hasNextLine()) {
String attribute = scanner.nextLine();
......@@ -30,13 +32,13 @@ public class NCBI_NLP {
cleanedAtt = cleanedAtt.replaceAll(pattern.toString(), " ");
i = matcher.start() + 1;
}
list.add(attribute +" --> "+ cleanedAtt.trim());
list.add(attribute +","+ cleanedAtt.trim());
}
} catch (FileNotFoundException e) {
System.out.println("Could not find file: " + file);
}
File outputFile = new File("./nlp/Output/AllAttributes_Processed.txt");
File outputFile = new File("./nlp/Output/AllAttributes_Cleaned.csv");
try {
FileUtils.writeLines(outputFile, list, false);
} catch (IOException e) {
......@@ -47,6 +49,45 @@ public class NCBI_NLP {
}
}
/**
 * Builds a Levenshtein distance matrix for the cleaned attributes and writes it as CSV to
 * {@code ./nlp/Output/LevenshteinDistance.csv}.
 *
 * <p>Input is {@code ./nlp/Output/AllAttributes_Cleaned.csv}, where each line has the form
 * {@code rawAttribute,cleanedAttribute}. The cleaned value is taken as everything after the
 * LAST comma on the line, so a raw attribute that itself contains commas does not shift the
 * column (this assumes the cleaned value is comma-free). Lines without any comma are
 * reported on standard output and skipped.
 *
 * <p>Output layout: the first row is an empty cell followed by one cell per attribute. Each
 * following row starts with the attribute and holds the distances to every attribute at the
 * same or a later position; cells for earlier positions are left empty, so only the diagonal
 * and upper triangle are filled.
 *
 * <p>If the input file is missing, a message is printed and no output file is written.
 *
 * @throws RuntimeException wrapping the {@link IOException} if the output file cannot be
 *                          opened or written
 */
@Test
public void levenshteinMatrix() {
    File inputFile = new File("./nlp/Output/AllAttributes_Cleaned.csv");
    List<String> cleanedAttributes = new ArrayList<>();
    try (Scanner scanner = new Scanner(inputFile)) {
        while (scanner.hasNextLine()) {
            String line = scanner.nextLine();
            int separator = line.lastIndexOf(',');
            if (separator < 0) {
                System.out.println("Skipping line without a cleaned attribute column: " + line);
                continue;
            }
            // substring() returns "" for a trailing comma, where split(",")[1] would throw.
            cleanedAttributes.add(line.substring(separator + 1));
        }
    } catch (FileNotFoundException e) {
        System.out.println("Could not find file: " + inputFile);
        // Nothing to compare; do not overwrite an existing matrix with an empty one.
        return;
    }

    File outputFile = new File("./nlp/Output/LevenshteinDistance.csv");
    int attrCount = cleanedAttributes.size();
    try (FileWriter fw = new FileWriter(outputFile);
         BufferedWriter bw = new BufferedWriter(fw);
         PrintWriter out = new PrintWriter(bw)) {
        // Header row: one cell per attribute. Concatenating the list itself would emit its
        // toString() form "[a, b, c]", which is not a valid CSV header.
        out.println("," + String.join(",", cleanedAttributes));
        for (int i = 0; i < attrCount; i++) {
            String attr1 = cleanedAttributes.get(i);
            StringBuilder row = new StringBuilder(attr1);
            System.out.println("Processing all distances (" + (attrCount - i) + ") for: " + attr1);
            for (int j = 0; j < attrCount; j++) {
                row.append(",");
                // Cells below the diagonal stay empty: the loop only computes pairs with
                // j >= i, and the mirrored pair is written in an earlier row.
                if (j >= i) {
                    String attr2 = cleanedAttributes.get(j);
                    row.append(LevenshteinDistance.compute_Levenshtein_distance(attr1, attr2));
                }
            }
            out.println(row);
        }
        // PrintWriter never throws on write failures; surface them explicitly.
        if (out.checkError()) {
            throw new IOException("Failed writing to " + outputFile);
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
@Test
public void testLemmatizer() throws IOException {
String[] tokens = new String[]{"Most", "large", "cities", "in", "the", "US", "had",
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment