# recommender.py (forked from takuti/prelims)

import os
from urllib.parse import urljoin

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from .base import BaseFrontMatterProcessor


class Recommender(BaseFrontMatterProcessor):
"""Tokenize contents, extract keywords, and generate a list of recommended
article paths.
Parameters
----------
permalink_base : str, default=''
Non-file name part of your article permalink. For example, if an
article saved under ``/path/to/articles/artcile-aaa.md`` eventually
becomes accessible from a URL
``https://awesome-website.com/diary/post/article-aaa/``, it means
``permalink_base='/diary/post'``; the permalink is a concatination of
the base and file name ``article-aaa``, excluding the extension.
topk : int, default=3
Number of recommended articles.
**kwargs : dict
Keyword arguments to be used to initialize sklearn's
``TfidfVectorizer``.
Notes
-----
One possibility of using ``**kwargs`` is to explicitly provide
``stop_words`` so the processor can filter out meaningless terms (e.g.,
'is', 'this', 'of'). Besides sklearn's ``stop_words='english'``, you could
leverage an arbitrary set of words; if you have installed ``spacy`` in your
environment, for example, their stop word list will give wider coverage of
words.
Examples
--------
>>> from prelims import Post
>>> from prelims.processor import Recommender
>>> from spacy.lang import en
>>> post_a = Post('/path/to/posts/a.md', {'title': 'foo'},
... '---\ntitle: foo\n---\n\nHellow world.',
... 'Hello world.')
>>> post_b = Post('/path/to/posts/b.md', {'title': 'bar'},
... '---\ntitle: bar\n---\n\nThis is a pen.',
... 'This is a pen.')
>>> recommender = Recommender(permalink_base='/posts',
... stop_words=en.STOP_WORDS)
>>> recommender.process([post_a, post_b])
>>> post_a.front_matter.keywords
['world', 'hello', 'pen']
>>> post_a.front_matter.recommendations
['/posts/b/']
>>> post_b.front_matter.keywords
['pen', 'world', 'hello']
>>> post_b.front_matter.recommendations
['/posts/a/']
"""

    def __init__(self, permalink_base='', topk=3, **kwargs):
        self.permalink_base = permalink_base
        self.topk = topk

        # copy TfidfVectorizer's keyword defaults so the shared class-level
        # dict is never mutated, then apply only the kwargs it accepts;
        # unknown keys are dropped silently
        vectorizer_kwargs = dict(
            TfidfVectorizer.__init__.__kwdefaults__ or {})
        for arg, value in kwargs.items():
            if arg in vectorizer_kwargs:
                vectorizer_kwargs[arg] = value
        self.vectorizer_kwargs = vectorizer_kwargs
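
        # e.g., ``Recommender(stop_words='english', max_df=0.95)`` overrides
        # just those two entries; keys TfidfVectorizer does not accept are
        # ignored rather than raising a TypeError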

    def process(self, posts):
        """Extract keywords and generate a list of recommended articles
        based on a content-based filtering technique.
        """
        contents = [post.content for post in posts]
        paths = [post.path for post in posts]

        # build a TF-IDF weighted document-term matrix
        vectorizer = TfidfVectorizer(**self.vectorizer_kwargs)
        tfidf = vectorizer.fit_transform(contents)

        # per document, sort term indices by TF-IDF score in descending order
        indices = tfidf.toarray().argsort(axis=1, kind='stable')[:, ::-1]
        keywords = np.array(vectorizer.get_feature_names_out())[indices]

        similarities = cosine_similarity(tfidf)

        for i in range(len(contents)):
            # find top-k most-similar articles
            # (except for the target article itself, whose self-similarity
            # is 1.0)
            top_indices = np.argsort(
                similarities[i, :], kind='stable')[::-1][1:(self.topk + 1)]
            recommend_permalinks = [
                self.__path_to_permalink(paths[j]) for j in top_indices
            ]

            # attach the top-10 keywords and recommended permalinks to the
            # post's front matter
            posts[i].update('keywords', keywords[i, :10].tolist(),
                            allow_overwrite=True)
            posts[i].update('recommendations', recommend_permalinks,
                            allow_overwrite=True)

    def __path_to_permalink(self, path):
        """Convert a file path into a permalink, i.e., the part of the final
        URL excluding the file extension.
        """
        file, _ = os.path.splitext(os.path.basename(path))
        if file == 'index':
            # for an index file, use its parent directory name instead
            # (e.g., /path/to/article-aaa/index.md -> article-aaa)
            file = os.path.basename(os.path.dirname(path))
        return urljoin(f'{self.permalink_base}/', f'{file}/')
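

# A minimal usage sketch, assuming the ``Post`` constructor signature shown
# in the class docstring above (path, front matter dict, raw content,
# stripped content) and sklearn's built-in English stop word list; the file
# paths and contents are made up for illustration.
if __name__ == '__main__':
    from prelims import Post

    posts = [
        Post('/path/to/posts/a.md', {'title': 'foo'},
             '---\ntitle: foo\n---\n\nHello world.', 'Hello world.'),
        Post('/path/to/posts/b.md', {'title': 'bar'},
             '---\ntitle: bar\n---\n\nThis is a pen.', 'This is a pen.'),
    ]

    recommender = Recommender(permalink_base='/posts', stop_words='english')
    recommender.process(posts)

    # each post now carries 'keywords' and 'recommendations' in its front
    # matter, e.g., post.front_matter.recommendations == ['/posts/b/']
    for post in posts:
        print(post.path, post.front_matter.keywords,
              post.front_matter.recommendations)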