diff --git a/src/main/java/nl/fairbydesign/backend/ncbi/NCBI_Matrices.java b/src/main/java/nl/fairbydesign/backend/ncbi/NCBI_Matrices.java index c2cf18b2120081e7e3f180428f78d70f25f613b4..50795268af35feb28c1d609ce35045150b60835f 100644 --- a/src/main/java/nl/fairbydesign/backend/ncbi/NCBI_Matrices.java +++ b/src/main/java/nl/fairbydesign/backend/ncbi/NCBI_Matrices.java @@ -1,11 +1,9 @@ package nl.fairbydesign.backend.ncbi; import com.fasterxml.jackson.annotation.JsonInclude; -import com.fasterxml.jackson.core.JsonFactory; -import com.fasterxml.jackson.core.JsonParser; -import com.fasterxml.jackson.core.JsonToken; import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.*; +import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; import org.junit.Test; @@ -17,11 +15,10 @@ import java.util.*; public class NCBI_Matrices { static String startTime = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd_HH-mm-ss")); - static String targetDir = "./json/all"; @Test public void attributeAbundanceMatrix() throws IOException { - File root = new File(targetDir+"/fetch"); + File root = new File("./json/all/fetch"); // Map<projectID, Set[attributes]> Map<String, Set<String>> attributeMap = new HashMap<>(); Set<String> attributes = new HashSet<>(); @@ -56,11 +53,11 @@ public class NCBI_Matrices { } } System.out.println("processed "+counter+" files in total"); - writeAttributes(attributes); + AAMWriteAttributes(attributes); AAMtoCSV(attributes, attributeMap); } - public Set<String> getAllNodeKeys(String json) throws IOException { + private Set<String> getAllNodeKeys(String json) throws IOException { ObjectMapper objectMapper = new ObjectMapper(); objectMapper.configure(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY, true); objectMapper.configure(MapperFeature.ACCEPT_CASE_INSENSITIVE_PROPERTIES, true); @@ -97,7 +94,7 @@ public class NCBI_Matrices { * @param attributeMap A HashMap of HashSets with 
projectID as keys */ private void AAMtoCSV(Set<String> sampleAttributes, Map<String, Set<String>> attributeMap) { - String folder = targetDir+"/parsings/"; + String folder = "./json/all/parsings/"; boolean isFolderCreated = new File(folder).exists() || new File(folder).mkdirs(); File file = new File(folder+"AAM"+startTime+".csv"); @@ -137,7 +134,7 @@ public class NCBI_Matrices { /** * @param sampleAttributes A HashSet of possible attributes */ - private void writeAttributes (Set<String> sampleAttributes) { + private void AAMWriteAttributes(Set<String> sampleAttributes) { String folder = "./nlp/Input/"; boolean isFolderCreated = new File(folder).exists() || new File(folder).mkdirs(); File file = new File(folder+"AllAttributes_Raw.txt"); @@ -157,4 +154,27 @@ public class NCBI_Matrices { } } + /** + * Prints the cleaned form of each attribute in ./nlp/Output/AllAttributes_Cleaned.csv, + * the file written by NCBI_NLP.cleanInput() as "raw attribute,cleaned attribute" lines. + * The first line is skipped (presumably a header - TODO confirm). + */ + @Test + public void similarityMatrix() { + File inputFile = new File("./nlp/Output/AllAttributes_Cleaned.csv"); + try (Scanner scanner = new Scanner(inputFile)) { + // Guard the skip so an empty file does not throw NoSuchElementException. + if (scanner.hasNextLine()) { + scanner.nextLine(); + } + while (scanner.hasNextLine()) { + String line = scanner.nextLine(); + // The raw attribute may itself contain commas and the cleaned part may be empty, so take the text after the last comma instead of split(",")[1]. + System.out.println(line.substring(line.lastIndexOf(',') + 1)); + } + } catch (FileNotFoundException e) { + System.out.println("Could not find file: " + inputFile); + } + } + + } diff --git a/src/main/java/nl/fairbydesign/backend/ncbi/NCBI_NLP.java b/src/main/java/nl/fairbydesign/backend/ncbi/NCBI_NLP.java index abf245fe8af97ea11fc9f98971e84fe9a27abe59..bce6c861df9fc92ea012b32dd5b1753ee99e8789 100644 --- a/src/main/java/nl/fairbydesign/backend/ncbi/NCBI_NLP.java +++ b/src/main/java/nl/fairbydesign/backend/ncbi/NCBI_NLP.java @@ -9,6 +9,8 @@ import org.junit.Test; import java.io.*; import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import java.util.Scanner; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -19,7 +21,7 @@ public class NCBI_NLP { public void cleanInput() { File file = new File("./nlp/Input/AllAttributes_Raw.txt"); ArrayList<String> list = new ArrayList<>(); - Pattern pattern =
Pattern.compile("[^a-zA-Z\\d\\s]|\\s\\S\\s|\\s\\S$|^\\S\\s|\\s+"); + Pattern pattern = Pattern.compile("[^a-zA-Z\\d\\s]|\\s+"); try (Scanner scanner = new Scanner(file)) { while (scanner.hasNextLine()) { String attribute = scanner.nextLine(); @@ -30,13 +32,13 @@ public class NCBI_NLP { cleanedAtt = cleanedAtt.replaceAll(pattern.toString(), " "); i = matcher.start() + 1; } - list.add(attribute +" --> "+ cleanedAtt.trim()); + list.add(attribute +","+ cleanedAtt.trim()); } } catch (FileNotFoundException e) { System.out.println("Could not find file: " + file); } - File outputFile = new File("./nlp/Output/AllAttributes_Processed.txt"); + File outputFile = new File("./nlp/Output/AllAttributes_Cleaned.csv"); try { FileUtils.writeLines(outputFile, list, false); } catch (IOException e) { @@ -47,6 +49,53 @@ public class NCBI_NLP { } } + /** + * Reads the cleaned attributes from ./nlp/Output/AllAttributes_Cleaned.csv and writes their + * pairwise distances to ./nlp/Output/LevenshteinDistance.csv. Only the diagonal and the cells + * to its right are filled in; cells left of the diagonal stay empty. + */ + @Test + public void levenshteinMatrix() { + File inputFile = new File("./nlp/Output/AllAttributes_Cleaned.csv"); + List<String> cleanedAttributes = new ArrayList<>(); + try (Scanner scanner = new Scanner(inputFile)) { + while (scanner.hasNextLine()) { + String line = scanner.nextLine(); + // The raw attribute may itself contain commas and the cleaned part may be empty, so take the text after the last comma instead of split(",")[1]. + cleanedAttributes.add(line.substring(line.lastIndexOf(',') + 1)); + } + } catch (FileNotFoundException e) { + System.out.println("Could not find file: " + inputFile); + } + + File outputFile = new File("./nlp/Output/LevenshteinDistance.csv"); + int attrCount = cleanedAttributes.size(); + try (FileWriter fw = new FileWriter(outputFile); + BufferedWriter bw = new BufferedWriter(fw); + PrintWriter out = new PrintWriter(bw)) + { + // Header row: join explicitly, because List.toString() would add brackets and ", " separators. + out.println("," + String.join(",", cleanedAttributes)); + for (int i = 0; i < attrCount; i++) { + String attr1 = cleanedAttributes.get(i); + StringBuilder row = new StringBuilder(attr1); + System.out.println("Processing all distances (" + (attrCount - i) + ") for: " + attr1); + for (int j = 0; j < attrCount; j++) { + if (j < i) { + row.append(","); + } else { + String attr2 = cleanedAttributes.get(j); + int distance = LevenshteinDistance.compute_Levenshtein_distance(attr1, attr2); +
row.append(",").append(distance); + } + } + out.println(row); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + @Test public void testLemmatizer() throws IOException { String[] tokens = new String[]{"Most", "large", "cities", "in", "the", "US", "had",