fastText Text Classification

fastText1

Install

Installing directly with pip install fasttext throws a pile of errors, so install it as follows instead:

$ git clone https://github.com/facebookresearch/fastText.git
$ cd fastText
$ pip install .

Installed this way, fasttext can be imported in PowerShell, but the import fails in the Anaconda Prompt and in PyCharm with: ImportError: DLL load failed.

The workaround is to activate another conda environment and run pip install fasttext there. You will find that the package already built for base is reused, so nothing needs to be downloaded again; it installs straight away and the import works in that environment.

Then go back to base, pip uninstall fasttext, and pip install fasttext again; it reinstalls from the package that the other environment already has, and the import problem is gone.
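
Spelled out as commands, with other_env standing in for whichever second environment you activate (the name is only a placeholder):

$ conda activate other_env
$ pip install fasttext      # reuses the package already built for base, no re-download
$ conda activate base
$ pip uninstall fasttext
$ pip install fasttext      # reinstalls cleanly from the same build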

fastText2

fastText model

import torch
import torch.nn as nn
import torch.nn.functional as F


class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        if config.embedding_pretrained is not None:
            self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)
        self.embedding_ngram2 = nn.Embedding(config.n_gram_vocab, config.embed)
        self.embedding_ngram3 = nn.Embedding(config.n_gram_vocab, config.embed)
        self.dropout = nn.Dropout(config.dropout)
        self.fc1 = nn.Linear(config.embed * 3, config.hidden_size)
        # self.dropout2 = nn.Dropout(config.dropout)
        self.fc2 = nn.Linear(config.hidden_size, config.num_classes)

    def forward(self, x):
        # x = (token_ids, seq_len, bigram_ids, trigram_ids)
        out_word = self.embedding(x[0])
        out_bigram = self.embedding_ngram2(x[2])
        out_trigram = self.embedding_ngram3(x[3])
        out = torch.cat((out_word, out_bigram, out_trigram), -1)

        out = out.mean(dim=1)
        out = self.dropout(out)
        out = self.fc1(out)
        out = F.relu(out)
        out = self.fc2(out)
        return out

nn.Embedding can be viewed as a lookup table: every index corresponds to a word and that word's embedding vector, and the module is used to initialize the word embeddings. If padding_idx is given, the row at that index is filled with zeros, so padded positions contribute an all-zero embedding.
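
A minimal illustration of padding_idx (the sizes here are arbitrary):

import torch
import torch.nn as nn

emb = nn.Embedding(num_embeddings=6, embedding_dim=4, padding_idx=5)
ids = torch.LongTensor([[0, 3, 5, 5]])  # 5 is the padding index in this toy setup
print(emb(ids).shape)   # torch.Size([1, 4, 4])
print(emb(ids)[0, 2])   # all zeros: the row at padding_idx is zero and receives no gradient updates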

config.n_gram_vocab is assigned at run time; in this setup config.n_vocab = len(vocab) and config.n_gram_vocab = 250499.

  1. Model input: [batch_size, seq_len]

  2. Embedding layers: randomly initialized, word-vector dimension embed_size; the 2-gram and 3-gram embeddings work the same way:
    word: [batch_size, seq_len, embed_size]
    2-gram: [batch_size, seq_len, embed_size]
    3-gram: [batch_size, seq_len, embed_size]

  3. Concatenate the embeddings:
    [batch_size, seq_len, embed_size * 3]

  4. Average over all seq_len positions:
    [batch_size, embed_size * 3]

  5. Fully connected layer + non-linear activation, with hidden size hidden_size:
    [batch_size, hidden_size]

  6. Fully connected layer + softmax normalization (a shape check follows this list):
    [batch_size, num_classes] ==> argmax gives the predicted label [batch_size, 1]
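
A quick way to verify these shapes is to run the Model above on random ids with a dummy config; every value below is an illustrative assumption, not the project's real setting:

from types import SimpleNamespace

import torch

# dummy config just for the shape check
config = SimpleNamespace(
    embedding_pretrained=None,
    n_vocab=5000, embed=300, n_gram_vocab=250499,
    dropout=0.5, hidden_size=256, num_classes=10,
)
model = Model(config)

batch_size, pad_size = 4, 32
x = (
    torch.randint(0, config.n_vocab, (batch_size, pad_size)),       # word ids
    torch.full((batch_size,), pad_size, dtype=torch.long),          # seq_len (not used by forward)
    torch.randint(0, config.n_gram_vocab, (batch_size, pad_size)),  # 2-gram bucket ids
    torch.randint(0, config.n_gram_vocab, (batch_size, pad_size)),  # 3-gram bucket ids
)
print(model(x).shape)  # torch.Size([4, 10]) -> [batch_size, num_classes]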

build_dataset

build_dataset
# inside build_dataset; ues_word chooses word-level vs char-level tokenization
if ues_word:
    tokenizer = lambda x: x.split(' ')  # word-level: tokens separated by spaces
else:
    tokenizer = lambda x: [y for y in x]  # char-level: one token per character
if os.path.exists(config.vocab_path):
    vocab = pkl.load(open(config.vocab_path, 'rb'))
else:
    vocab = build_vocab(config.train_path, tokenizer=tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
    pkl.dump(vocab, open(config.vocab_path, 'wb'))
print(f"Vocab size: {len(vocab)}")

If a vocabulary file already exists at vocab_path, it is loaded directly; otherwise the vocabulary is built from the training data and saved.

def build_vocab(file_path, tokenizer, max_size, min_freq):
    vocab_dic = {}
    with open(file_path, 'r', encoding='UTF-8') as f:
        for line in tqdm(f):
            lin = line.strip()
            if not lin:
                continue
            content = lin.split('\t')[0]
            for word in tokenizer(content):
                vocab_dic[word] = vocab_dic.get(word, 0) + 1
    # sort by token frequency, descending, e.g. [(' ', 80926), ('0', 60319), ('1', 40420)]
    vocab_list = sorted([_ for _ in vocab_dic.items() if _[1] >= min_freq], key=lambda x: x[1], reverse=True)[:max_size]
    # walk the frequency-sorted list and assign ids from 0 upward, e.g. {' ': 0, '0': 1, '1': 2}
    vocab_dic = {word_count[0]: idx for idx, word_count in enumerate(vocab_list)}
    vocab_dic.update({UNK: len(vocab_dic), PAD: len(vocab_dic) + 1})
    return vocab_dic

The generated vocabulary is ordered by descending frequency, with ids assigned from 0, e.g. {'市民': 0, '反映': 1, '问题': 2, '来电': 3, '希望': 4}.

Finally the UNK and PAD tokens (UNK, PAD = '<UNK>', '<PAD>') are appended. This is why the fastText model earlier defines self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1): padding_idx=config.n_vocab - 1 is exactly the id of PAD, so wherever PAD appears the embedding is filled with zeros.
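
A toy run of build_vocab, assuming it is defined as above in the same script (the training-file content is made up):

UNK, PAD = '<UNK>', '<PAD>'

with open('toy_train.txt', 'w', encoding='UTF-8') as f:
    f.write('市民 来电 反映 问题\t0\n希望 解决 问题\t1\n')

vocab = build_vocab('toy_train.txt', tokenizer=lambda x: x.split(' '),
                    max_size=10000, min_freq=1)
print(vocab)
# {'问题': 0, '市民': 1, '来电': 2, '反映': 3, '希望': 4, '解决': 5, '<UNK>': 6, '<PAD>': 7}
# PAD always gets the last id, i.e. len(vocab) - 1 == config.n_vocab - 1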

build_dataset

# (these helpers and load_dataset sit inside build_dataset, after the vocabulary is loaded)

def biGramHash(sequence, t, buckets):
    t1 = sequence[t - 1] if t - 1 >= 0 else 0
    return (t1 * 14918087) % buckets

def triGramHash(sequence, t, buckets):
    t1 = sequence[t - 1] if t - 1 >= 0 else 0
    t2 = sequence[t - 2] if t - 2 >= 0 else 0
    return (t2 * 14918087 * 18408749 + t1 * 14918087) % buckets

def load_dataset(path, pad_size=32):
    contents = []
    with open(path, 'r', encoding='UTF-8') as f:
        for line in tqdm(f):
            lin = line.strip()
            if not lin:
                continue
            content, label = lin.split('\t')
            label = label.replace('"', "")
            words_line = []
            token = tokenizer(content)
            seq_len = len(token)
            if pad_size:
                if len(token) < pad_size:
                    token.extend([PAD] * (pad_size - len(token)))
                else:
                    token = token[:pad_size]
                    seq_len = pad_size
            # word to id
            for word in token:
                words_line.append(vocab.get(word, vocab.get(UNK)))

            # fasttext ngram
            buckets = config.n_gram_vocab
            bigram = []
            trigram = []
            # ------ngram------
            for i in range(pad_size):
                bigram.append(biGramHash(words_line, i, buckets))
                trigram.append(triGramHash(words_line, i, buckets))
            # -----------------
            contents.append((words_line, int(label), seq_len, bigram, trigram))
    return contents  # [([...], 0), ([...], 1), ...]

# build_dataset then returns the vocabulary and the three splits
train = load_dataset(config.train_path, config.pad_size)
dev = load_dataset(config.dev_path, config.pad_size)
test = load_dataset(config.test_path, config.pad_size)
return vocab, train, dev, test

load_dataset: each line of the dataset is split on '\t' into content and label; label = label.replace('"', "") is there because some labels carry a stray double quote after the split, for reasons that are still unclear. The tokens are then padded or truncated to pad_size ("pad the short, cut the long"), and seq_len is the number of content tokens (pad_size if the content is longer than pad_size, the actual length otherwise).
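
The pad-or-truncate step in isolation, with toy tokens (PAD as defined for the vocabulary):

PAD = '<PAD>'
pad_size = 5

token = ['市民', '来电', '反映']
seq_len = len(token)                                   # 3: shorter than pad_size, keep the real length
if len(token) < pad_size:
    token.extend([PAD] * (pad_size - len(token)))      # pad the short sentence
else:
    token = token[:pad_size]                           # cut the long sentence
    seq_len = pad_size
print(token, seq_len)  # ['市民', '来电', '反映', '<PAD>', '<PAD>'] 3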

words_line stores the vocabulary id of every token in the sentence; tokens not found in the vocabulary are mapped to UNK.

Numbers such as 14918087 are just a few primes used as a simple hash function, and they can be changed; the goal is to map every n-gram into a single fixed-size bucket table.
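
For example, with the two hash helpers above in scope and a toy bucket count:

words_line = [13, 7, 42, 5]   # ids of one (toy) padded sentence
buckets = 1000                # toy value; the post uses config.n_gram_vocab = 250499

bigram = [biGramHash(words_line, i, buckets) for i in range(len(words_line))]
trigram = [triGramHash(words_line, i, buckets) for i in range(len(words_line))]
print(bigram)   # position 0 has no previous token, so its hash falls back to 0
print(trigram)  # every n-gram is mapped into [0, buckets)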

At training time, words_line, bigram, and trigram index into the word embedding, 2-gram embedding, and 3-gram embedding tables respectively, and the looked-up vectors are averaged.

DatasetIterater

iter = DatasetIterater(dataset, config.batch_size, config.device)
class DatasetIterater(object):
    def __init__(self, batches, batch_size, device):
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) // batch_size
        self.residue = False  # True when the samples do not divide evenly into full batches
        if len(batches) % batch_size != 0:
            self.residue = True
        self.index = 0
        self.device = device

    def _to_tensor(self, datas):
        # xx = [xxx[2] for xxx in datas]
        # indexx = np.argsort(xx)[::-1]
        # datas = np.array(datas)[indexx]
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)
        bigram = torch.LongTensor([_[3] for _ in datas]).to(self.device)
        trigram = torch.LongTensor([_[4] for _ in datas]).to(self.device)

        # length before padding (set to pad_size for sequences that were truncated)
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        return (x, seq_len, bigram, trigram), y

    # The returned (x, seq_len, bigram, trigram), y pair matches the training loop:
    #   for i, (trains, labels) in enumerate(train_iter):
    #       outputs = model(trains)

    def __next__(self):
        # the last batch, which holds fewer than batch_size samples
        if self.residue and self.index == self.n_batches:
            batches = self.batches[self.index * self.batch_size: len(self.batches)]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

    def __iter__(self):
        return self

    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches
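
A minimal usage sketch, assuming train, config, and model come from the earlier snippets:

import torch.nn.functional as F

train_iter = DatasetIterater(train, config.batch_size, config.device)
print(len(train_iter))                 # number of batches, counting the final partial batch

for trains, labels in train_iter:
    # trains is the tuple (x, seq_len, bigram, trigram) produced by _to_tensor
    outputs = model(trains)
    loss = F.cross_entropy(outputs, labels)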