
Commit a872294

perhapszzy authored and committed
add chapter9 examples
1 parent 38fba9b commit a872294

13 files changed: +843543 −0 lines
@@ -0,0 +1,126 @@
```python
import tensorflow as tf
```
#### 1. sparse_softmax_cross_entropy_with_logits example.
```python
# Assume the vocabulary size is 3 and the corpus contains the two words "2 0".
word_labels = tf.constant([2, 0])

# Assume the logits the model produces when predicting the two words are
# [2.0, -1.0, 3.0] and [1.0, 0.0, -0.5].
predict_logits = tf.constant([[2.0, -1.0, 3.0], [1.0, 0.0, -0.5]])

# Compute the cross entropy with sparse_softmax_cross_entropy_with_logits.
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=word_labels, logits=predict_logits)

# Running this prints [0.32656264, 0.46436879], the cross-entropy (log
# perplexity) loss of the two predictions.
sess = tf.Session()
print(sess.run(loss))
```

Output:
```
[ 0.32656264 0.46436879]
```
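As a sanity check (not part of the original notebook), the two values can be reproduced by hand: with a sparse label, the cross entropy is simply the negative log of the softmax probability assigned to the correct class. A minimal numpy sketch:

```python
# Minimal sketch, not from the book: recompute the loss values with numpy.
import numpy as np

np_logits = np.array([[2.0, -1.0, 3.0], [1.0, 0.0, -0.5]])
np_labels = np.array([2, 0])

# Softmax over each row, then pick the probability of the correct class.
probs = np.exp(np_logits) / np.sum(np.exp(np_logits), axis=1, keepdims=True)
manual_loss = -np.log(probs[np.arange(len(np_labels)), np_labels])
print(manual_loss)  # roughly [0.3266 0.4644], matching the TensorFlow result
```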
#### 2. softmax_cross_entropy_with_logits example.
56+
{
57+
"cell_type": "code",
58+
"execution_count": 8,
59+
"metadata": {},
60+
"outputs": [
61+
{
62+
"name": "stdout",
63+
"output_type": "stream",
64+
"text": [
65+
"[ 0.32656264 0.46436879]\n",
66+
"[ 0.37656265 0.48936883]\n"
67+
]
68+
}
69+
],
70+
"source": [
71+
"# softmax_cross_entropy_with_logits与上面的函数相似,但是需要将预测目标以\n",
72+
"# 概率分布的形式给出。\n",
73+
"word_prob_distribution = tf.constant([[0.0, 0.0, 1.0], [1.0, 0.0, 0.0]])\n",
74+
"loss = tf.nn.softmax_cross_entropy_with_logits(\n",
75+
" labels=word_prob_distribution, logits=predict_logits)\n",
76+
"# 运行结果与上面相同:[ 0.32656264, 0.46436879]\n",
77+
"print(sess.run(loss))\n",
78+
"\n",
79+
"# label smoothing:将正确数据的概率设为一个比1.0略小的值,将错误数据的概率\n",
80+
"# 设为比0.0略大的值,这样可以避免模型与数据过拟合,在某些时候可以提高训练效果。\n",
81+
"word_prob_smooth = tf.constant([[0.01, 0.01, 0.98], [0.98, 0.01, 0.01]])\n",
82+
"loss = tf.nn.softmax_cross_entropy_with_logits(\n",
83+
" labels=word_prob_smooth, logits=predict_logits)\n",
84+
"# 运行结果:[ 0.37656265, 0.48936883]\n",
85+
"print(sess.run(loss))\n",
86+
"\n",
87+
"sess.close()\n"
88+
]
89+
},
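The smoothed target matrix above is written out by hand. Assuming the same TensorFlow 1.x API used in the rest of the notebook, an equivalent way to derive it from the sparse labels is `tf.one_hot` with non-default on/off values; a small sketch, not from the original code:

```python
# Sketch (assumption, not from the book): build the label-smoothed targets from
# the sparse labels instead of typing the matrix in by hand. With depth=3,
# on_value=0.98 and off_value=0.01 this yields
# [[0.01, 0.01, 0.98], [0.98, 0.01, 0.01]].
smooth_labels = tf.one_hot(word_labels, depth=3, on_value=0.98, off_value=0.01)
loss_smooth = tf.nn.softmax_cross_entropy_with_logits(
    labels=smooth_labels, logits=predict_logits)
# Evaluating loss_smooth in a session gives the same [0.37656265, 0.48936883].
```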
@@ -0,0 +1,138 @@
```python
import codecs
import collections
from operator import itemgetter
```
#### 1. Set the parameters.
```python
MODE = "PTB"    # Set MODE to one of "PTB", "TRANSLATE_EN", "TRANSLATE_ZH".

if MODE == "PTB":             # PTB data
    RAW_DATA = "../../datasets/PTB_data/ptb.train.txt"  # training data file
    VOCAB_OUTPUT = "ptb.vocab"                           # output vocabulary file
elif MODE == "TRANSLATE_ZH":  # Chinese half of the translation corpus
    RAW_DATA = "../../datasets/TED_data/train.txt.zh"
    VOCAB_OUTPUT = "zh.vocab"
    VOCAB_SIZE = 4000
elif MODE == "TRANSLATE_EN":  # English half of the translation corpus
    RAW_DATA = "../../datasets/TED_data/train.txt.en"
    VOCAB_OUTPUT = "en.vocab"
    VOCAB_SIZE = 10000
```
#### 2. Sort the words by frequency.
```python
counter = collections.Counter()
with codecs.open(RAW_DATA, "r", "utf-8") as f:
    for line in f:
        for word in line.strip().split():
            counter[word] += 1

# Sort the words in descending order of frequency.
sorted_word_to_cnt = sorted(
    counter.items(), key=itemgetter(1), reverse=True)
sorted_words = [x[0] for x in sorted_word_to_cnt]
```
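Because `collections.Counter` can already rank its entries, the explicit `sorted` call is equivalent to `most_common()`. A toy sketch with made-up sentences, just to show the shape of the result:

```python
# Equivalent ranking with Counter.most_common(), shown on a made-up toy corpus.
toy_counter = collections.Counter()
for toy_line in ["the cat sat", "the cat ate"]:   # illustrative sentences only
    toy_counter.update(toy_line.strip().split())

toy_sorted_words = [w for w, _ in toy_counter.most_common()]
print(toy_sorted_words)  # e.g. ['the', 'cat', 'sat', 'ate']; tie order is not guaranteed
```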
#### 3. Insert the special tokens.
```python
if MODE == "PTB":
    # We will later add the sentence-end token "<eos>" at every line break, so
    # put it into the vocabulary in advance.
    sorted_words = ["<eos>"] + sorted_words
elif MODE in ["TRANSLATE_EN", "TRANSLATE_ZH"]:
    # When the machine translation data is processed in Section 9.3.2, the
    # vocabulary also needs "<unk>" and the sentence-start token "<sos>" in
    # addition to "<eos>", and low-frequency words are dropped from it.
    sorted_words = ["<unk>", "<sos>", "<eos>"] + sorted_words
    if len(sorted_words) > VOCAB_SIZE:
        sorted_words = sorted_words[:VOCAB_SIZE]
```
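Note the order of operations: the special tokens are prepended first and the list is then cut to VOCAB_SIZE, so truncation only ever drops the rarest words at the tail. A toy illustration with made-up words and a made-up size of 5:

```python
# Toy illustration of the translation branch (made-up words, VOCAB_SIZE = 5):
# the special tokens stay at the front and only the rarest words fall off.
toy_words = ["the", "of", "cat", "zebra"]          # already sorted by frequency
toy_vocab = (["<unk>", "<sos>", "<eos>"] + toy_words)[:5]
print(toy_vocab)  # ['<unk>', '<sos>', '<eos>', 'the', 'of']
```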
#### 4. Save the vocabulary file.
```python
with codecs.open(VOCAB_OUTPUT, 'w', 'utf-8') as file_output:
    for word in sorted_words:
        file_output.write(word + "\n")
```
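After running the notebook in PTB mode, a quick way to check the generated ptb.vocab is to read it back and confirm it holds one unique token per line; this check is an assumption about usage, not code from the book:

```python
# Optional sanity check (assumption, not from the book).
import codecs

with codecs.open("ptb.vocab", "r", "utf-8") as f:
    vocab_words = [w.strip() for w in f]

print(len(vocab_words))                                   # vocabulary size
assert len(vocab_words) == len(set(vocab_words)), "duplicate tokens in vocab"
```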
@@ -0,0 +1,126 @@
```python
import codecs
import sys
```
#### 1. Set the parameters.
```python
# Set MODE to one of "PTB_TRAIN", "PTB_VALID", "PTB_TEST", "TRANSLATE_EN",
# "TRANSLATE_ZH".
MODE = "PTB_TRAIN"

if MODE == "PTB_TRAIN":        # PTB training data
    RAW_DATA = "../../datasets/PTB_data/ptb.train.txt"  # training data file
    VOCAB = "ptb.vocab"                                  # vocabulary file
    OUTPUT_DATA = "ptb.train"  # output file with words replaced by their ids
elif MODE == "PTB_VALID":      # PTB validation data
    RAW_DATA = "../../datasets/PTB_data/ptb.valid.txt"
    VOCAB = "ptb.vocab"
    OUTPUT_DATA = "ptb.valid"
elif MODE == "PTB_TEST":       # PTB test data
    RAW_DATA = "../../datasets/PTB_data/ptb.test.txt"
    VOCAB = "ptb.vocab"
    OUTPUT_DATA = "ptb.test"
elif MODE == "TRANSLATE_ZH":   # Chinese translation data
    RAW_DATA = "../../datasets/TED_data/train.txt.zh"
    VOCAB = "zh.vocab"
    OUTPUT_DATA = "train.zh"
elif MODE == "TRANSLATE_EN":   # English translation data
    RAW_DATA = "../../datasets/TED_data/train.txt.en"
    VOCAB = "en.vocab"
    OUTPUT_DATA = "train.en"
```
#### 2. Map the words to ids using the vocabulary.
```python
# Read the vocabulary and build a mapping from word to word id.
with codecs.open(VOCAB, "r", "utf-8") as f_vocab:
    vocab = [w.strip() for w in f_vocab.readlines()]
word_to_id = {k: v for (k, v) in zip(vocab, range(len(vocab)))}

# Replace any out-of-vocabulary (low-frequency) word with "<unk>".
def get_id(word):
    return word_to_id[word] if word in word_to_id else word_to_id["<unk>"]
```
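The same lookup can be written slightly more idiomatically with `dict.get`, and the inverse mapping comes in handy when turning id sequences back into text for inspection; a small sketch, not from the original notebook:

```python
# Sketch (not from the book): dict.get with a default id, plus the reverse map.
UNK_ID = word_to_id["<unk>"]

def get_id_v2(word):
    return word_to_id.get(word, UNK_ID)

# Reverse mapping for decoding id sequences back into words.
id_to_word = {v: k for k, v in word_to_id.items()}
```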
#### 3. Convert the data and save the result.
```python
fin = codecs.open(RAW_DATA, "r", "utf-8")
fout = codecs.open(OUTPUT_DATA, 'w', 'utf-8')
for line in fin:
    words = line.strip().split() + ["<eos>"]  # read the words and append <eos>
    # Replace each word with its id in the vocabulary.
    out_line = ' '.join([str(get_id(w)) for w in words]) + '\n'
    fout.write(out_line)
fin.close()
fout.close()
```
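To make the conversion concrete, here is a toy walk-through of a single line; the five-token vocabulary and the sentence are made up for illustration:

```python
# Toy walk-through of one line (made-up vocabulary and sentence).
toy_word_to_id = {"<unk>": 0, "<eos>": 1, "the": 2, "cat": 3, "sat": 4}

toy_line = "the cat sat on the mat"
toy_words = toy_line.strip().split() + ["<eos>"]
toy_ids = [str(toy_word_to_id.get(w, toy_word_to_id["<unk>"])) for w in toy_words]
print(' '.join(toy_ids))  # "2 3 4 0 2 0 1" -- "on" and "mat" map to <unk>
```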
