From 639a6a09e716ac6f0014e80bbff4fff47e855b6a Mon Sep 17 00:00:00 2001
From: Sergey Melderis
Date: Sat, 31 Mar 2018 22:43:19 -0400
Subject: [PATCH] idf_ setter for TfidfTransformer. Fixes #7102

---
 sklearn/feature_extraction/tests/test_text.py | 13 +++++++++++++
 sklearn/feature_extraction/text.py            | 13 +++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
index 22fbd3f17bea8..8beadbb53ec33 100644
--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
@@ -942,6 +942,19 @@ def test_pickling_transformer():
         orig.fit_transform(X).toarray())
 
 
+def test_transformer_idf_setter():
+    # An unfitted transformer whose idf_ was assigned must be usable as-is.
+    X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
+    orig = TfidfTransformer().fit(X)
+    copy = TfidfTransformer()
+    copy.idf_ = orig.idf_
+    # Use transform(), not fit_transform(): refitting copy on X would let
+    # this check pass even if the idf_ assigned above were ignored.
+    assert_array_equal(
+        copy.transform(X).toarray(),
+        orig.transform(X).toarray())
+
+
 def test_non_unique_vocab():
     vocab = ['a', 'b', 'c', 'a', 'a']
     vect = CountVectorizer(vocabulary=vocab)
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 9b1ebd6320b29..b231f20b463a9 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -1062,6 +1062,12 @@ class TfidfTransformer(BaseEstimator, TransformerMixin):
     sublinear_tf : boolean, default=False
         Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).
 
+    Attributes
+    ----------
+    idf_ : array, shape = [n_features], or None
+        The learned idf vector (global term weights)
+        when ``use_idf`` is set to True, None otherwise.
+
     References
     ----------
 
@@ -1157,6 +1163,13 @@ def idf_(self):
         # which means hasattr(self, "idf_") is False
         return np.ravel(self._idf_diag.sum(axis=0))
 
+    @idf_.setter
+    def idf_(self, value):
+        # Store the idf vector as the main diagonal of a square CSR matrix.
+        idf = np.asarray(value, dtype=np.float64)
+        size = idf.shape[0]
+        self._idf_diag = sp.spdiags(idf, diags=0, m=size, n=size, format='csr')
+
 
 class TfidfVectorizer(CountVectorizer):
     """Convert a collection of raw documents to a matrix of TF-IDF features.