Distance d’édition (4 octobre)¶
distance avec des mots de même longueur¶
[1]:
def distance(m1, m2):
if len(m1) != len(m2):
raise ValueError("La distance ne marche que pour des mots de même longueur.")
d = 0
for i in range(len(m1)):
if m1[i] != m2[i]:
d += 1
return d
distance("REMUNERER", "RENUMERER")
[1]:
2
Distance avec des mots presque de même longueur¶
[2]:
def distance_1(m1, m2):
if len(m1) == len(m2):
return distance(m1, m2)
if len(m1) > len(m2):
return distance_1(m2, m1)
if len(m1) + 1 != len(m2):
raise ValueError("La distance ne marche que pour len(m1) + 1 == len(m2).")
best = None
for i in range(len(m1) + 1):
m3 = m1[:i] + " " + m1[i:]
d = distance(m3, m2)
if best is None or d < best:
best = d
return best
def test_distance_1(fdist):
assert fdist("REMUNNERER", "RENUMERER") == 3
assert fdist("", "A") == 1
assert fdist("A", "") == 1
assert fdist("A", "A") == 0
assert fdist("", "") == 0
test_distance_1(distance_1)
Cas générique¶
[3]:
def distance_n(m1, m2):
if len(m1) > len(m2):
return distance_n(m2, m1)
if len(m1) + 1 == len(m2):
return distance_1(m1, m2)
best = None
for i in range(len(m1) + 1):
m3 = m1[:i] + " " + m1[i:]
d = distance_n(m3, m2)
if best is None or d < best:
best = d
return best
test_distance_1(distance_n)
[4]:
def test_distance_n(fdist):
assert fdist("", "AA") == 2
assert fdist("ii", "iji") == 1
test_distance_n(distance_n)
Coût algorithmique¶
La fonction effectue une boucle de (P+1) itération, puis une autre de (P+2) puis une autre jusqu’à Q. Le coût est donc en \(O\left(\frac{Q!}{P!} \right)\).
Distance de Levenshtein¶
Voir Distance de Levenshtein.
[5]:
import numpy as np
def distance_levenstein(m1, m2):
dist = np.zeros((len(m1) + 1, len(m2) + 1))
for i in range(len(m1) + 1):
dist[i, 0] = i
for j in range(len(m2) + 1):
dist[0, j] = j
for i in range(1, len(m1) + 1):
for j in range(1, len(m2) + 1):
c = 1 if m1[i - 1] != m2[j - 1] else 0
dist[i, j] = min(
[dist[i - 1, j] + 1, dist[i, j - 1] + 1, dist[i - 1, j - 1] + c]
)
return dist[len(m1), len(m2)]
distance_levenstein("REMUNERER", "RENUMERER")
test_distance_1(distance_levenstein)
test_distance_n(distance_levenstein)
Avec alignement¶
[8]:
import numpy as np
def distance_levenstein_alignment(m1, m2):
dist = np.zeros((len(m1) + 1, len(m2) + 1))
pred = np.zeros((len(m1) + 1, len(m2) + 1))
for i in range(len(m1) + 1):
dist[i, 0] = i
pred[i, 0] = 1
for j in range(len(m2) + 1):
dist[0, j] = j
pred[0, j] = 2
for i in range(1, len(m1) + 1):
for j in range(1, len(m2) + 1):
a = dist[i - 1, j] + 1
b = dist[i, j - 1] + 1
c = dist[i - 1, j - 1] + (1 if m1[i - 1] != m2[j - 1] else 0)
if c <= min(a, b):
dist[i, j] = c
pred[i, j] = 3
elif a <= b:
dist[i, j] = a
pred[i, j] = 1
else:
dist[i, j] = b
pred[i, j] = 2
# que faire avec pred?
i, j = len(m1), len(m2)
positions = []
chars = []
while i != 0 or j != 0:
positions.append((i, j))
p = pred[i, j]
if p == 1:
chars.append((m1[i - 1], " "))
i -= 1
elif p == 2:
chars.append((" ", m2[j - 1]))
j -= 1
else:
chars.append((m1[i - 1], m2[j - 1]))
i -= 1
j -= 1
positions.reverse()
chars.reverse()
return (
dist[len(m1), len(m2)],
positions,
"".join(c[0] for c in chars),
"".join(c[1] for c in chars),
)
distance_levenstein_alignment("REMUNEREERE", "RENNNUMERER")
[8]:
(6.0,
[(1, 1),
(2, 2),
(2, 3),
(2, 4),
(3, 5),
(4, 6),
(5, 7),
(6, 8),
(7, 9),
(8, 9),
(9, 10),
(10, 11),
(11, 11)],
'RE MUNEREERE',
'RENNNUMER ER ')
Un coût différent pour les accents¶
[10]:
def cost(c1, c2):
if c1 == c2:
return 0
if c1 in "eéêèë" and c2 in "eéêèë":
return 0.5
return 1
def distance_levenstein_alignment_accent(m1, m2):
dist = np.zeros((len(m1) + 1, len(m2) + 1))
pred = np.zeros((len(m1) + 1, len(m2) + 1))
for i in range(len(m1) + 1):
dist[i, 0] = i
pred[i, 0] = 1
for j in range(len(m2) + 1):
dist[0, j] = j
pred[0, j] = 2
for i in range(1, len(m1) + 1):
for j in range(1, len(m2) + 1):
a = dist[i - 1, j] + 1
b = dist[i, j - 1] + 1
c = dist[i - 1, j - 1] + cost(m1[i - 1], m2[j - 1])
if c <= min(a, b):
dist[i, j] = c
pred[i, j] = 3
elif a <= b:
dist[i, j] = a
pred[i, j] = 1
else:
dist[i, j] = b
pred[i, j] = 2
i, j = len(m1), len(m2)
positions = []
chars = []
while i != 0 or j != 0:
positions.append((i, j))
p = pred[i, j]
if p == 1:
chars.append((" ", m2[j - 1]))
i -= 1
elif p == 2:
chars.append((m1[i - 1], " "))
j -= 1
else:
chars.append((m1[i - 1], m2[j - 1]))
i -= 1
j -= 1
positions.reverse()
chars.reverse()
return (
dist[len(m1), len(m2)],
positions,
"".join(c[0] for c in chars),
"".join(c[1] for c in chars),
)
distance_levenstein_alignment_accent("rémunnérer", "renumérer")
[10]:
(3.5,
[(1, 1),
(2, 2),
(3, 3),
(4, 4),
(5, 4),
(6, 5),
(7, 6),
(8, 7),
(9, 8),
(10, 9)],
'rému nérer',
'renuumérer')