CaseBasedReasoning/similarity.py

83 lines
2.2 KiB
Python
Raw Normal View History

2022-12-13 13:25:31 +00:00
import math
from typing import Any, Callable
"""
METRIC SIMILARITY FUNCTIONS
"""
def manhattan_sim(q_val: float, c_val: float) -> float:
    """Return a similarity in (0, 1] derived from the Manhattan distance.

    For two scalars the Manhattan distance is ``abs(q_val - c_val)``; it is
    mapped to a similarity with ``1 / (1 + distance)``, so equal values
    score 1.0 and the score falls towards 0 as the values move apart.

    Args:
        q_val: Attribute value of the query.
        c_val: Attribute value of the case being compared.

    Returns:
        The similarity score ``1 / (1 + |q_val - c_val|)``.
    """
    distance = abs(q_val - c_val)
    return 1 / (1 + distance)
def euclid_sim(q_val: float, c_val: float) -> float:
    """Return a similarity in (0, 1] derived from the Euclidean distance.

    In one dimension the Euclidean distance ``sqrt((q - c) ** 2)`` is simply
    ``abs(q - c)``. Using ``abs`` directly avoids squaring the difference,
    which raises ``OverflowError`` for large float differences.

    Args:
        q_val: Attribute value of the query.
        c_val: Attribute value of the case being compared.

    Returns:
        The similarity score ``1 / (1 + |q_val - c_val|)``.
    """
    distance = abs(q_val - c_val)
    return 1 / (1 + distance)
# Registry of the numeric (metric) similarity functions defined in this module.
METRIC_SIMS = [manhattan_sim, euclid_sim]
"""
SYMBOLIC SIMILARITY
"""
def symbolic_sim(q_field_name: str, c_field_name: str, sim_matrix: dict) -> float:
    """Look up the similarity of two symbolic values in a nested mapping.

    Args:
        q_field_name: Symbol of the query; used as the outer key.
        c_field_name: Symbol of the case; used as the inner key.
        sim_matrix: Nested mapping ``{query_symbol: {case_symbol: score}}``.

    Returns:
        The value stored at ``sim_matrix[q_field_name][c_field_name]``.

    Raises:
        KeyError: If either symbol is missing from a plain ``dict`` matrix.
    """
    row = sim_matrix[q_field_name]
    return row[c_field_name]
# Registry of the symbolic (lookup-table based) similarity functions.
SYMBOLIC_SIMS = [symbolic_sim]
"""
CHARACTER EDIT DISTANCE
"""
def edit_distance(
    word_1: str, word_2: str, to_same_case: bool = True
) -> "int | tuple[str, int]":
    """Greedily rewrite ``word_1`` towards ``word_2`` and count the edits.

    The walk goes left to right and uses single-character insertions and
    deletions only (there is no substitution step), so the count is not
    guaranteed to equal the Levenshtein distance.

    Args:
        word_1: Word that is transformed.
        word_2: Target word.
        to_same_case: Upper-case both words first so case is ignored.

    Returns:
        The bare integer ``0`` when the two inputs are identical as given.
        Otherwise a tuple ``(edited, count)`` where ``edited`` is ``word_1``
        after the edits (upper-cased when ``to_same_case`` is true) and
        ``count`` is the number of insertions/deletions performed.
        NOTE: the two return shapes are historical and are preserved for
        backward compatibility.
    """
    if word_1 == word_2:
        return 0

    if to_same_case:
        word_1, word_2 = word_1.upper(), word_2.upper()

    source, target = list(word_1), list(word_2)
    # Aliases one of the two lists: when it is ``source`` the loop bound
    # follows the edits made below.
    longer_word = source if len(source) > len(target) else target

    i, count = 0, 0
    while i < len(longer_word):
        # source is exhausted -> append the current char of target.
        # No ``continue``: the appended char matches below and advances i.
        if i >= len(source):
            source.append(target[i])
            count += 1

        # target is exhausted -> drop the surplus char of source.
        if i >= len(target):
            source.pop(i)
            count += 1
            continue

        # Same char at this position -> move on.
        if source[i] == target[i]:
            i += 1
            continue

        # Not at the beginning or the end of target: if the previous chars
        # agree and the current char of source equals the NEXT char of
        # target, insert the missing target char in between.
        #   e.g. source[i-1] = "M" ; source[i] = "R"
        #        target[i-1] = "M" ; target[i] = "A" ; target[i+1] = "R"
        # The upper bound keeps target[i + 1] in range; without it a
        # mismatch at the last index of target raised IndexError.
        if (
            0 < i < len(target) - 1
            and source[i - 1] == target[i - 1]
            and source[i] == target[i + 1]
        ):
            source.insert(i, target[i])
            count += 1
            i += 1
            continue

        # Plain mismatch -> delete the current char of source and retry.
        source.pop(i)
        count += 1

    return "".join(source), count
# Registry of the string comparison functions.
# NOTE(review): edit_distance yields an edit count (0 or a (word, count)
# tuple), not a normalised similarity score - confirm callers expect that.
STRING_SIMS = [edit_distance]