Traceable n-grams with tf-idf#
This notebook looks into the way n-grams are stored in CountVectorizer and TfidfVectorizer and shows how the storage used by scikit-learn (<= 0.21), which joins the tokens of an n-gram into a single space-separated string, is ambiguous in some cases.
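To see why a space-joined key can be ambiguous, here is a minimal sketch in plain Python, with no scikit-learn involved: two different token tuples collapse to the same string once joined with a single space.

# Two distinct n-grams produce the same space-joined key.
print(" ".join(("t", "is the")))  # 't is the'
print(" ".join(("t is", "the")))  # 't is the' -- same key, different n-gram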
Example with CountVectorizer#
scikit-learn version#
import numpy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from mlinsights.mlmodel.sklearn_text import (
    TraceableCountVectorizer,
    TraceableTfidfVectorizer,
)
corpus = numpy.array(
    [
        "This is the first document.",
        "This document is the second document.",
        "Is this the first document?",
        "",
    ]
).reshape((4,))
mod1 = CountVectorizer(ngram_range=(1, 2))
mod1.fit(corpus)
mod1.transform(corpus).todense()
matrix([[1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0],
[2, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0],
[1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
{'this': 12, 'is': 4, 'the': 9, 'first': 2, 'document': 0, 'this is': 14, 'is the': 5, 'the first': 10, 'first document': 3, 'second': 7, 'this document': 13, 'document is': 1, 'the second': 11, 'second document': 8, 'is this': 6, 'this the': 15}
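The dictionary above is mod1.vocabulary_: every n-gram, stored as a single space-joined string, is mapped to its column index in the matrix. A small sketch to read a column back, assuming only the vocabulary_ attribute shown above:

# Invert the vocabulary to map a column index back to its n-gram.
index_to_ngram = {v: k for k, v in mod1.vocabulary_.items()}
print(index_to_ngram[14])  # 'this is'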
mlinsights version#
corpus = numpy.array(
    [
        "This is the first document.",
        "This document is the second document.",
        "Is this the first document?",
        "",
    ]
).reshape((4,))
mod2 = TraceableCountVectorizer(ngram_range=(1, 2))
mod2.fit(corpus)
mod2.transform(corpus).todense()
matrix([[1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0],
[2, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0],
[1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
{('this',): 12, ('is',): 4, ('the',): 9, ('first',): 2, ('document',): 0, ('this', 'is'): 14, ('is', 'the'): 5, ('the', 'first'): 10, ('first', 'document'): 3, ('second',): 7, ('this', 'document'): 13, ('document', 'is'): 1, ('the', 'second'): 11, ('second', 'document'): 8, ('is', 'this'): 6, ('this', 'the'): 15}
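mod2.vocabulary_ keeps each n-gram as a tuple of tokens instead of a joined string. As a quick sanity check, here is a sketch verifying that both vectorizers produce the same matrix and that only the vocabulary format differs:

# Both transforms should be identical; only the vocabulary keys change.
diff = mod1.transform(corpus) != mod2.transform(corpus)
assert diff.nnz == 0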
The new class does exactly the same thing but keeps n-grams in a more explicit form. The original string form is sometimes ambiguous, as the next example shows.
Funny example with TfidfVectorizer#
scikit-learn version#
corpus = numpy.array(
    [
        "This is the first document.",
        "This document is the second document.",
        "Is this the first document?",
        "",
    ]
).reshape((4,))
mod1 = TfidfVectorizer(ngram_range=(1, 2), token_pattern="[a-zA-Z ]{1,4}")
mod1.fit(corpus)
mod1.transform(corpus).todense()
matrix([[0. , 0. , 0.32940523, 0.32940523, 0. ,
0. , 0. , 0. , 0.25970687, 0.25970687,
0. , 0. , 0.25970687, 0.25970687, 0. ,
0. , 0. , 0. , 0. , 0.25970687,
0. , 0. , 0.25970687, 0.25970687, 0. ,
0. , 0.25970687, 0.25970687, 0.25970687, 0. ,
0.32940523, 0. , 0. ],
[0.24528087, 0.24528087, 0. , 0. , 0.24528087,
0.24528087, 0.24528087, 0.24528087, 0. , 0. ,
0.24528087, 0.24528087, 0. , 0. , 0. ,
0. , 0. , 0. , 0.24528087, 0. ,
0.24528087, 0.24528087, 0. , 0. , 0.24528087,
0.24528087, 0. , 0. , 0.19338226, 0.24528087,
0. , 0.24528087, 0.24528087],
[0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0.25453384, 0.25453384,
0. , 0. , 0.25453384, 0.25453384, 0.3228439 ,
0.3228439 , 0.3228439 , 0.3228439 , 0. , 0.25453384,
0. , 0. , 0.25453384, 0.25453384, 0. ,
0. , 0.25453384, 0.25453384, 0. , 0. ,
0. , 0. , 0. ],
[0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. ]])
{'this': 28, ' is ': 2, 'the ': 26, 'firs': 12, 't do': 22, 'cume': 8, 'nt': 19, 'this is ': 30, ' is the ': 3, 'the firs': 27, 'firs t do': 13, 't do cume': 23, 'cume nt': 9, ' doc': 0, 'umen': 31, 't is': 24, ' the': 6, ' sec': 4, 'ond ': 20, 'docu': 10, 'ment': 18, 'this doc': 29, ' doc umen': 1, 'umen t is': 32, 't is the': 25, ' the sec': 7, ' sec ond ': 5, 'ond docu': 21, 'docu ment': 11, 'is t': 16, 'his ': 14, 'is t his ': 17, 'his the ': 15}
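The dictionary above is mod1.vocabulary_ again. The unusual token_pattern lets tokens contain spaces, which is exactly what makes the space-joined keys ambiguous. A short sketch of how this pattern splits a sentence, using only Python's re module (the vectorizer lowercases the tokens afterwards):

import re

# Tokens produced by this pattern may themselves contain spaces.
print(re.findall("[a-zA-Z ]{1,4}", "This is the first document."))
# ['This', ' is ', 'the ', 'firs', 't do', 'cume', 'nt']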
mlinsights version#
mod2 = TraceableTfidfVectorizer(ngram_range=(1, 2), token_pattern="[a-zA-Z ]{1,4}")
mod2.fit(corpus)
mod2.transform(corpus).todense()
matrix([[0. , 0. , 0.32940523, 0.32940523, 0. ,
0. , 0. , 0. , 0.25970687, 0.25970687,
0. , 0. , 0.25970687, 0.25970687, 0. ,
0. , 0. , 0. , 0. , 0.25970687,
0. , 0. , 0.25970687, 0.25970687, 0. ,
0. , 0.25970687, 0.25970687, 0.25970687, 0. ,
0.32940523, 0. , 0. ],
[0.24528087, 0.24528087, 0. , 0. , 0.24528087,
0.24528087, 0.24528087, 0.24528087, 0. , 0. ,
0.24528087, 0.24528087, 0. , 0. , 0. ,
0. , 0. , 0. , 0.24528087, 0. ,
0.24528087, 0.24528087, 0. , 0. , 0.24528087,
0.24528087, 0. , 0. , 0.19338226, 0.24528087,
0. , 0.24528087, 0.24528087],
[0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0.25453384, 0.25453384,
0. , 0. , 0.25453384, 0.25453384, 0.3228439 ,
0.3228439 , 0.3228439 , 0.3228439 , 0. , 0.25453384,
0. , 0. , 0.25453384, 0.25453384, 0. ,
0. , 0.25453384, 0.25453384, 0. , 0. ,
0. , 0. , 0. ],
[0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. ]])
{('this',): 28, (' is ',): 2, ('the ',): 26, ('firs',): 12, ('t do',): 22, ('cume',): 8, ('nt',): 19, ('this', ' is '): 30, (' is ', 'the '): 3, ('the ', 'firs'): 27, ('firs', 't do'): 13, ('t do', 'cume'): 23, ('cume', 'nt'): 9, (' doc',): 0, ('umen',): 31, ('t is',): 24, (' the',): 6, (' sec',): 4, ('ond ',): 20, ('docu',): 10, ('ment',): 18, ('this', ' doc'): 29, (' doc', 'umen'): 1, ('umen', 't is'): 32, ('t is', ' the'): 25, (' the', ' sec'): 7, (' sec', 'ond '): 5, ('ond ', 'docu'): 21, ('docu', 'ment'): 11, ('is t',): 16, ('his ',): 14, ('is t', 'his '): 17, ('his ', 'the '): 15}
As you can see, the original n-gram 't is the' (index 25 in the vocabulary) is a little bit ambiguous. It is in fact ('t is', ' the'), as TraceableTfidfVectorizer lets you know. From the string alone, the original form could have been ('t', 'is the'), ('t is', ' the'), ('t is ', ' the'), ('t is ', 'the'), ('t', 'is ', 'the')… The regular expression gives some insight but no information that can easily be used to guess the right split.
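A small check, sketched from the vocabulary_ attributes shown above, makes the difference concrete:

# Look up the ambiguous n-gram in both vocabularies by its column index.
inv1 = {v: k for k, v in mod1.vocabulary_.items()}
inv2 = {v: k for k, v in mod2.vocabulary_.items()}
print(inv1[25])  # the ambiguous string form
print(inv2[25])  # ('t is', ' the') -- the explicit tuple form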