Skip to content
Snippets Groups Projects
Commit 99deb744 authored by Cribellier, Antoine's avatar Cribellier, Antoine
Browse files

Added function to get unmatched_recordings + associated test functions

parent d097db0d
No related branches found
No related tags found
1 merge request!7Improved architecture, documentation and flexibility, added working tests for preprocessing of images
import os, re
from typing import List
from difflib import SequenceMatcher
from datetime import datetime
import numpy as np
__author__ = "C.J. Voesenek and A. Cribellier"
__maintainer__ = "A. Cribellier"
__email__ = "antoine.cribellier@wur.nl"
def match_folders_by_name(folder_names: List[str],
threshold_ratio_diff: float = None,
threshold_nb_diff_char: int = None) -> List[List[str]]:
"""
Matches list of folder names together by checking how similar their folder names are and associate them together
The best matches will be found by comparing to the smaller list of folders to avoid matching one to two folders.
Args:
folder_names: List[list[]] of folder names
threshold_ratio_diff: Will check the similarity percentage (between 0 and 1)
threshold_nb_diff_char: Will check how many characters are different between the file names.
Returns:
matched_directories: List of lists (# of recordings * # of cameras) containing the directory paths of the matched recordings for all cameras.
Raises:
ValueError: If both threshold type are given instead of only one
TypeError: thresholds have to be either a positive int or a float between 0.0 and 1.0.
"""
if threshold_ratio_diff is not None and threshold_nb_diff_char is not None:
raise ValueError("Only one threshold has to be procured")
elif threshold_ratio_diff is None and threshold_nb_diff_char is None:
threshold_ratio_diff = 1.0
if threshold_ratio_diff is not None:
assert type(threshold_ratio_diff) is float
assert 0.0 <= threshold_ratio_diff <= 1.0
elif threshold_nb_diff_char is not None:
if type(threshold_nb_diff_char) is float and threshold_nb_diff_char.is_integer():
threshold_nb_diff_char = int(threshold_nb_diff_char)
assert threshold_nb_diff_char >= 0
else:
raise TypeError('Threshold has to be either a positive int or a float between 0.0 and 1.0')
nb_directories = len(folder_names)
nb_folders = [len(x) for x in folder_names]
i_min = np.argmin(nb_folders)
matched_directories, ratios = [[]] * len(folder_names), [[]] * len(folder_names)
is_similar_enough = [[]] * len(folder_names)
for i in range(nb_directories):
if i == i_min:
matched_directories[i] = folder_names[i_min]
ratios[i] = [1.0] * len(folder_names[i_min])
else:
matches = [max([(SequenceMatcher(a=f1, b=f2).ratio(), f2) for f2 in folder_names[i]]) for f1 in folder_names[i_min]]
matched_directories[i] = [folder_name for ratio, folder_name in matches]
ratios[i] = [ratio for ratio, folder_name in matches]
if threshold_ratio_diff is not None:
is_similar_enough[i] = [ratio >= threshold_ratio_diff for ratio in ratios[i]]
elif threshold_nb_diff_char is not None:
nb_char_diffs = [round((1-ratio)*len(folder_names[i_min][j])) for j, ratio in enumerate(ratios[i])]
is_similar_enough[i] = [nb_char_diff <= threshold_nb_diff_char for nb_char_diff in nb_char_diffs]
matched_directories = [list(x) for x in zip(*matched_directories)]
is_similar_enough = [list(x) for x in zip(*is_similar_enough)]
matched_directories = [x for i, x in enumerate(matched_directories) if all(is_similar_enough[i])]
return matched_directories
def match_recordings_by_name(directories: List[str],
threshold_ratio_diff: float = None,
threshold_nb_diff_char: int = None) -> List[List[str]]:
"""
Matches the recordings together (minimum 2) by checking how similar their folder names are and associate them together
The best matches will be found by comparing to the smaller list of folders to avoid matching one to two other folders.
Args:
directories: List of directory paths (one per camera) each containing folders (to be matched) with recorded images
threshold_ratio_diff: Will check the similarity percentage (between 0 and 1)
threshold_nb_diff_char: Will check how many characters are different between the file names.
Returns:
matched_directories: List of lists (# of recordings * # of cameras) containing the directory paths of the matched recordings for all cameras.
"""
folder_names, nb_folders = [[]] * len(directories), [[]] * len(directories)
for i, directory in enumerate(directories):
folders = os.listdir(directory)
folder_names[i] = [os.path.basename(os.path.normpath(folder)) for folder in folders]
if threshold_nb_diff_char is not None:
return match_folders_by_name(folder_names, threshold_nb_diff_char=threshold_nb_diff_char)
else:
return match_folders_by_name(folder_names, threshold_ratio_diff=threshold_ratio_diff)
def match_folders_by_date(folder_names: List[str],
expr: str = r"\d{8}_\d{6}",
format_date_str: str ='%Y%m%d_%H%M%S',
threshold_s: int = 30) -> List[List[str]]:
"""
Matches list of folder names together by checking how similar their folder names are and associate them together
The best matches will be found by comparing to the smaller list of folders to avoid matching one to two folders.
Args:
folder_names: List[list[]] of folder names
expr: Regular expression pattern matching recording folders.
format_date_str: format of the date and time in the folder names.
threshold_s: Will check if the date in the folder names differ from less than threshold in second
Returns:
matched_directories: List of lists (# of recordings * # of cameras) containing the directory paths of the matched recordings for all cameras.
Raises:
TypeError: threshold_s has to be a positive int or a float between 0.0 and 1.0.
"""
if type(threshold_s) is float and threshold_s.is_integer():
threshold_s = int(threshold_s)
if not type(threshold_s) is int and threshold_s >= 0:
raise TypeError('Threshold has to be either a positive int or a float between 0.0 and 1.0')
pattern = re.compile(expr)
dates = [[]] * len(folder_names)
for i in range(len(folder_names)):
new_folder_names, new_dates = [], []
for j in range(len(folder_names[i])):
match = re.search(pattern, folder_names[i][j])
if match:
new_folder_names.append(folder_names[i][j])
new_dates.append(datetime.strptime(match.group(), format_date_str))
folder_names[i] = new_folder_names
dates[i] = new_dates
nb_folders = [len(x) for x in folder_names]
i_min = np.argmin(nb_folders)
matched_directories, diff_time_s = [[]] * len(folder_names), [[]] * len(folder_names)
is_similar_enough = [[]] * len(folder_names)
for i in range(len(folder_names)):
if i == i_min:
matched_directories[i] = folder_names[i_min]
diff_time_s[i] = [0] * len(folder_names[i_min])
else:
matches = [min([(abs((d1 - d2).total_seconds()), folder_names[i][j])
for j, d2 in enumerate(dates[i])]) for d1 in dates[i_min]]
matched_directories[i] = [folder_name for diff_time_s, folder_name in matches]
diff_time_s[i] = [diff_s for diff_s, folder_name in matches]
is_similar_enough[i] = [x <= threshold_s for x in diff_time_s[i]]
matched_directories = [list(x) for x in zip(*matched_directories)]
is_similar_enough = [list(x) for x in zip(*is_similar_enough)]
matched_directories = [x for i, x in enumerate(matched_directories) if all(is_similar_enough[i])]
return matched_directories
def match_recordings_by_date(directories: List[str],
expr: str = r"\d{8}_\d{6}",
format_date_str: str ='%Y%m%d_%H%M%S',
threshold_s: int = 30) -> List[List[str]]:
"""
Matches the recordings together (minimum 2) by checking how similar the date in their folder names are and associate them together
The best matches will be found by comparing to the smaller list of folders to avoid matching one to two other folders.
Args:
directories: List of directory paths (one per camera) each containing folders (to be matched) with recorded images
expr: Regular expression pattern matching recording folders.
format_date_str: format of the date and time in the folder names.
threshold_s: Will check if the date in the folder names differ from less than threshold in second
Returns:
matched_directories: List of lists (# of recordings * # of cameras) containing the directory paths of the matched recordings for all cameras.
"""
folder_names, nb_folders = [[]] * len(directories), [[]] * len(directories)
for i, directory in enumerate(directories):
folders = os.listdir(directory)
folder_names[i] = [os.path.basename(os.path.normpath(folder)) for folder in folders]
return match_folders_by_date(folder_names, expr, format_date_str, threshold_s)
def get_unmatched_recordings(directories: List[str], matched_directories: List[str]) -> List[str]:
unmatched_directories = [[]] * len(directories)
return unmatched_directories
def move_recordings_to_folder(sources: List[str], destinations: List[str]) -> None:
unmatched_directories = 0
\ No newline at end of file
from images import process, recordings
from images import process, utils_recordings
def test_match_folders() -> None:
file_names = [['cam1_20220304_055123', 'cam1_20220304_101111', 'cam1_20220304_110140', 'cam1_20220304_120352'],
folder_names = [['cam1_20220304_055123', 'cam1_20220304_101111', 'cam1_20220304_110140', 'cam1_20220304_120352'],
['cam2_20220304_055123', 'cam2_20220304_101115', 'cam2_20220304_120402'],
['cam3_20220304_055123', 'cam3_20220304_101111', 'cam3_20220304_111158', 'cam3_20220304_120352']]
matched_directories = recordings.match_folders_by_name(file_names)
matched_directories = recordings.match_folders_by_name(folder_names)
assert matched_directories == []
matched_directories = recordings.match_folders_by_name(file_names, threshold_nb_diff_char=1)
matched_directories = recordings.match_folders_by_name(folder_names, threshold_nb_diff_char=1)
assert matched_directories == [['cam1_20220304_055123', 'cam2_20220304_055123', 'cam3_20220304_055123']]
matched_directories = recordings.match_folders_by_date(file_names, threshold_s=60)
matched_directories = recordings.match_folders_by_date(folder_names, threshold_s=60)
assert matched_directories == [['cam1_20220304_055123', 'cam2_20220304_055123', 'cam3_20220304_055123'],
['cam1_20220304_101111', 'cam2_20220304_101115', 'cam3_20220304_101111'],
['cam1_20220304_120352', 'cam2_20220304_120402', 'cam3_20220304_120352']]
......@@ -27,6 +27,20 @@ def test_match_recordings() -> None:
assert len(matched_directories) == 2
def test_get_unmatched_folder_names() -> None:
folder_names = [['cam1_20220304_055123', 'cam1_20220304_101111', 'cam1_20220304_110140', 'cam1_20220304_120352'],
['cam2_20220304_055123', 'cam2_20220304_101115', 'cam2_20220304_120402'],
['cam3_20220304_055123', 'cam3_20220304_101111', 'cam3_20220304_111158', 'cam3_20220304_120352']]
matched_folder_names = [['cam1_20220304_055123', 'cam2_20220304_055123', 'cam3_20220304_055123'],
['cam1_20220304_101111', 'cam2_20220304_101115', 'cam3_20220304_101111'],
['cam1_20220304_120352', 'cam2_20220304_120402', 'cam3_20220304_120352']]
unmatched_folder_names = recordings.get_unmatched_folders(folder_names, matched_folder_names)
assert unmatched_folder_names == [['cam1_20220304_110140'], [], ['cam3_20220304_111158',]]
def test_process() -> None:
# TODO! Make test_process
print('TODO')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment