Distance d’édition (4 octobre)

distance avec des mots de même longueur

[1]:
def distance(m1, m2):
    if len(m1) != len(m2):
        raise ValueError("La distance ne marche que pour des mots de même longueur.")
    d = 0
    for i in range(len(m1)):
        if m1[i] != m2[i]:
            d += 1
    return d


distance("REMUNERER", "RENUMERER")
[1]:
2

Distance avec des mots presque de même longueur

[2]:
def distance_1(m1, m2):
    if len(m1) == len(m2):
        return distance(m1, m2)
    if len(m1) > len(m2):
        return distance_1(m2, m1)
    if len(m1) + 1 != len(m2):
        raise ValueError("La distance ne marche que pour len(m1) + 1 == len(m2).")
    best = None
    for i in range(len(m1) + 1):
        m3 = m1[:i] + " " + m1[i:]
        d = distance(m3, m2)
        if best is None or d < best:
            best = d
    return best


def test_distance_1(fdist):
    assert fdist("REMUNNERER", "RENUMERER") == 3
    assert fdist("", "A") == 1
    assert fdist("A", "") == 1
    assert fdist("A", "A") == 0
    assert fdist("", "") == 0


test_distance_1(distance_1)

Cas générique

[3]:
def distance_n(m1, m2):
    if len(m1) > len(m2):
        return distance_n(m2, m1)
    if len(m1) + 1 == len(m2):
        return distance_1(m1, m2)
    best = None
    for i in range(len(m1) + 1):
        m3 = m1[:i] + " " + m1[i:]
        d = distance_n(m3, m2)
        if best is None or d < best:
            best = d
    return best


test_distance_1(distance_n)
[4]:
def test_distance_n(fdist):
    assert fdist("", "AA") == 2
    assert fdist("ii", "iji") == 1


test_distance_n(distance_n)

Coût algorithmique

La fonction effectue une boucle de (P+1) itération, puis une autre de (P+2) puis une autre jusqu’à Q. Le coût est donc en \(O\left(\frac{Q!}{P!} \right)\).

Distance de Levenshtein

Voir Distance de Levenshtein.

[5]:
import numpy as np


def distance_levenstein(m1, m2):
    dist = np.zeros((len(m1) + 1, len(m2) + 1))
    for i in range(len(m1) + 1):
        dist[i, 0] = i
    for j in range(len(m2) + 1):
        dist[0, j] = j
    for i in range(1, len(m1) + 1):
        for j in range(1, len(m2) + 1):
            c = 1 if m1[i - 1] != m2[j - 1] else 0
            dist[i, j] = min(
                [dist[i - 1, j] + 1, dist[i, j - 1] + 1, dist[i - 1, j - 1] + c]
            )
    return dist[len(m1), len(m2)]


distance_levenstein("REMUNERER", "RENUMERER")

test_distance_1(distance_levenstein)
test_distance_n(distance_levenstein)

Avec alignement

[8]:
import numpy as np


def distance_levenstein_alignment(m1, m2):
    dist = np.zeros((len(m1) + 1, len(m2) + 1))
    pred = np.zeros((len(m1) + 1, len(m2) + 1))
    for i in range(len(m1) + 1):
        dist[i, 0] = i
        pred[i, 0] = 1
    for j in range(len(m2) + 1):
        dist[0, j] = j
        pred[0, j] = 2
    for i in range(1, len(m1) + 1):
        for j in range(1, len(m2) + 1):
            a = dist[i - 1, j] + 1
            b = dist[i, j - 1] + 1
            c = dist[i - 1, j - 1] + (1 if m1[i - 1] != m2[j - 1] else 0)
            if c <= min(a, b):
                dist[i, j] = c
                pred[i, j] = 3
            elif a <= b:
                dist[i, j] = a
                pred[i, j] = 1
            else:
                dist[i, j] = b
                pred[i, j] = 2
    # que faire avec pred?
    i, j = len(m1), len(m2)
    positions = []
    chars = []
    while i != 0 or j != 0:
        positions.append((i, j))
        p = pred[i, j]
        if p == 1:
            chars.append((m1[i - 1], " "))
            i -= 1
        elif p == 2:
            chars.append((" ", m2[j - 1]))
            j -= 1
        else:
            chars.append((m1[i - 1], m2[j - 1]))
            i -= 1
            j -= 1

    positions.reverse()
    chars.reverse()
    return (
        dist[len(m1), len(m2)],
        positions,
        "".join(c[0] for c in chars),
        "".join(c[1] for c in chars),
    )


distance_levenstein_alignment("REMUNEREERE", "RENNNUMERER")
[8]:
(6.0,
 [(1, 1),
  (2, 2),
  (2, 3),
  (2, 4),
  (3, 5),
  (4, 6),
  (5, 7),
  (6, 8),
  (7, 9),
  (8, 9),
  (9, 10),
  (10, 11),
  (11, 11)],
 'RE  MUNEREERE',
 'RENNNUMER ER ')

Un coût différent pour les accents

[10]:
def cost(c1, c2):
    if c1 == c2:
        return 0
    if c1 in "eéêèë" and c2 in "eéêèë":
        return 0.5
    return 1


def distance_levenstein_alignment_accent(m1, m2):
    dist = np.zeros((len(m1) + 1, len(m2) + 1))
    pred = np.zeros((len(m1) + 1, len(m2) + 1))
    for i in range(len(m1) + 1):
        dist[i, 0] = i
        pred[i, 0] = 1
    for j in range(len(m2) + 1):
        dist[0, j] = j
        pred[0, j] = 2
    for i in range(1, len(m1) + 1):
        for j in range(1, len(m2) + 1):
            a = dist[i - 1, j] + 1
            b = dist[i, j - 1] + 1
            c = dist[i - 1, j - 1] + cost(m1[i - 1], m2[j - 1])
            if c <= min(a, b):
                dist[i, j] = c
                pred[i, j] = 3
            elif a <= b:
                dist[i, j] = a
                pred[i, j] = 1
            else:
                dist[i, j] = b
                pred[i, j] = 2

    i, j = len(m1), len(m2)
    positions = []
    chars = []
    while i != 0 or j != 0:
        positions.append((i, j))
        p = pred[i, j]
        if p == 1:
            chars.append((" ", m2[j - 1]))
            i -= 1
        elif p == 2:
            chars.append((m1[i - 1], " "))
            j -= 1
        else:
            chars.append((m1[i - 1], m2[j - 1]))
            i -= 1
            j -= 1

    positions.reverse()
    chars.reverse()
    return (
        dist[len(m1), len(m2)],
        positions,
        "".join(c[0] for c in chars),
        "".join(c[1] for c in chars),
    )


distance_levenstein_alignment_accent("rémunnérer", "renumérer")
[10]:
(3.5,
 [(1, 1),
  (2, 2),
  (3, 3),
  (4, 4),
  (5, 4),
  (6, 5),
  (7, 6),
  (8, 7),
  (9, 8),
  (10, 9)],
 'rému nérer',
 'renuumérer')

Notebook on github