机器学习分类模型对比

代码都在 TensorFlow for Machine Intelligence

线性回归

个人认为比较适合输入与输出有明显关联的数据,比如商场客流和一年中的时间、天气、自然灾害、突发事件的组合的关系。

示例代码

数据在 http://people.sc.fsu.edu/~jburkardt/datasets/regression/x09.txt

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import tensorflow as tf

# Model parameters for the linear regression example.
# The name is given to tf.Variable (not to tf.zeros) so that the variable itself is
# labelled "weights" in the graph, matching how "bias" is named below.
W = tf.Variable(tf.zeros([2, 1]), name="weights")  # one weight per input feature
b = tf.Variable(0., name="bias")  # scalar bias term


def inference(X):
    """Return the linear model output for the feature matrix ``X``.

    The features are multiplied by the weight matrix ``W`` and the bias ``b``
    is added; in this example the features are (weight, age) pairs and the
    result is the predicted blood fat content.
    """
    weighted_sum = tf.matmul(X, W)
    return weighted_sum + b


def loss(X, Y):
    """Return the sum of squared errors between predictions and targets.

    Args:
        X: Feature tensor forwarded to ``inference``.
        Y: Target values, one per row of ``X`` (rank 1 or a single column).

    Returns:
        A scalar tensor holding the summed squared differences.
    """
    Y_predicted = inference(X)
    # inference() produces a column because W has a single output column.
    # A rank-1 Y would broadcast against that column into a square matrix,
    # pairing every target with every prediction. Reshaping Y to a column
    # keeps the difference element-wise; it is a no-op if Y is already a column.
    Y_column = tf.reshape(Y, [-1, 1])
    return tf.reduce_sum(tf.squared_difference(Y_column, Y_predicted))


def inputs():
    """Return the hard-coded training set as float tensors.

    Returns:
        A pair ``(features, targets)``: ``features`` holds one
        ``[weight, age]`` row per sample and ``targets`` holds the matching
        blood fat content values.
    """
    # Each record is (weight, age, blood fat content).
    records = (
        (84, 46, 354),
        (73, 20, 190),
        (65, 52, 405),
        (70, 30, 263),
        (76, 57, 451),
        (69, 25, 302),
        (63, 28, 288),
        (72, 36, 385),
        (79, 57, 402),
        (75, 44, 365),
        (27, 24, 209),
        (89, 31, 290),
        (65, 52, 346),
        (57, 23, 254),
        (59, 60, 395),
        (69, 48, 434),
        (60, 34, 220),
        (79, 51, 374),
        (75, 50, 308),
        (82, 34, 220),
        (59, 46, 311),
        (67, 23, 181),
        (85, 37, 274),
        (55, 40, 303),
        (63, 30, 244),
    )
    features = [[weight, age] for weight, age, _ in records]
    targets = [fat for _, _, fat in records]

    return tf.to_float(features), tf.to_float(targets)


def train(total_loss):
    """Build the training op that minimizes ``total_loss`` by gradient descent.

    Args:
        total_loss: Scalar loss tensor to minimize.

    Returns:
        The op produced by ``GradientDescentOptimizer.minimize``.
    """
    step_size = 1e-7
    optimizer = tf.train.GradientDescentOptimizer(step_size)
    return optimizer.minimize(total_loss)


def evaluate(sess, X, Y):
    """Print the model's predictions for two fixed (weight, age) samples.

    ``X`` and ``Y`` are accepted for interface symmetry but are not used.
    """
    # Expected outputs noted by the author: roughly 303 and 256.
    probes = ([[80., 25.]], [[65., 25.]])
    for probe in probes:
        prediction = inference(probe)
        print(sess.run(prediction))


# Train the linear regression model and print sample predictions.
with tf.Session() as sess:
    # global_variables_initializer is a function: it must be called to obtain
    # the init op before .run() can be invoked on it.
    tf.global_variables_initializer().run()
    X, Y = inputs()
    total_loss = loss(X, Y)
    # W and b are trainable by default, so the optimizer updates them.
    train_op = train(total_loss)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    training_steps = 1000
    for step in range(training_steps):
        sess.run([train_op])
        # Report the loss every 10 steps to watch training progress.
        if step % 10 == 0:
            print("loss: ", sess.run([total_loss]))

    evaluate(sess, X, Y)

    coord.request_stop()
    coord.join(threads)
    # No explicit sess.close(): the enclosing `with` block closes the session.

逻辑回归

适用于结果只有正反两种情况的推断,而且推断条件和结果之间没有线性关系。

示例代码

主要变化是处理过程在之前的线性函数外面加了一层 sigmoid 函数,使本来输出的期望值变成了概率。

1
2
3
4
5
6
7
8
9
# Before (linear regression): W has shape [2, 1].
# NOTE(review): name="weights" is passed to tf.zeros here, not to tf.Variable.
W = tf.Variable(tf.zeros([2, 1], name="weights"))

# After (logistic regression): W has shape [5, 1].
W = tf.Variable(tf.zeros([5, 1], name="weights"))

# Before: the model output is the raw linear combination of the inputs.
def inference(X):
return tf.matmul(X, W) + b

# After: the same linear combination is passed through a sigmoid.
def inference(X):
return tf.sigmoid(tf.matmul(X, W) + b)

损失函数由计算 L2 变为了计算交叉熵

1
2
3
4
5
6
# Before (linear regression): sum of squared differences between Y and the prediction.
def loss(X, Y):
Y_predicted = inference(X)
return tf.reduce_sum(tf.squared_difference(Y, Y_predicted))

# After (logistic regression): mean sigmoid cross-entropy, computed from the
# linear output before the sigmoid is applied.
# NOTE(review): the two arguments are passed positionally here, whereas the full
# listing further down uses labels=/logits= keywords; confirm which form the
# installed TensorFlow version expects.
def loss(X, Y):
return tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(tf.matmul(X, W) + b, Y))

数据在 Titanic: Machine Learning from Disaster

代码好像有问题:按正常的代码训练,正确率才 40%,最后损失值是负的好几百;改成让损失趋近于 0 来训练,正确率有所提升;后来改成让损失往大的方向训练,正确率竟然到了 54%。

找到了问题,计算交叉熵的时候示例代码中的顺序是反的

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import tensorflow as tf
import os

# Model parameters for the logistic regression example:
# a [5, 1] weight matrix and a scalar bias, both starting at zero.
W = tf.Variable(initial_value=tf.zeros(shape=[5, 1]), name="weights")
b = tf.Variable(initial_value=0.0, name="bias")


def combine_inputs(X):
    """Return the linear combination ``X @ W + b`` (the pre-sigmoid value)."""
    weighted_sum = tf.matmul(X, W)
    return weighted_sum + b


def inference(X):
    """Return the sigmoid of the linear combination of ``X``.

    Author's note on the model: P(Y = 1 | X = x) = e^(x'β) / (1 + e^(x'β)),
    where β is the maximum-likelihood estimate.
    """
    linear_output = combine_inputs(X)
    return tf.sigmoid(linear_output)


def loss(X, Y):
    """Return the mean sigmoid cross-entropy between the model output and ``Y``.

    Args:
        X: Feature tensor forwarded to ``combine_inputs``.
        Y: Ground-truth labels, same shape as the model output.

    Returns:
        A scalar tensor: the per-sample cross-entropy averaged over the batch,
        used as the quantity minimized by gradient descent.
    """
    # The pre-sigmoid model output is the `logits` argument and the ground truth
    # is the `labels` argument. Passing them the other way round makes the loss
    # unbounded below instead of bottoming out at zero.
    logits = combine_inputs(X)
    per_sample = tf.nn.sigmoid_cross_entropy_with_logits(labels=Y, logits=logits)
    return tf.reduce_mean(per_sample)


def read_csv(batch_size, file_name, record_defaults):
    """Build a shuffled-batch input pipeline over a CSV file.

    Args:
        batch_size: Number of rows per batch.
        file_name: Path of the CSV file, joined onto the current working directory.
        record_defaults: Per-column defaults passed to ``tf.decode_csv``; they
            also determine the type each column is decoded as.

    Returns:
        The tensors produced by ``tf.train.shuffle_batch``, one per CSV column.
    """
    csv_path = os.path.join(os.getcwd(), file_name)
    path_queue = tf.train.string_input_producer([csv_path])

    # Read the file one line at a time, skipping the header row.
    line_reader = tf.TextLineReader(skip_header_lines=1)
    _, line = line_reader.read(path_queue)

    # Decode each line into typed column values according to record_defaults.
    columns = tf.decode_csv(line, record_defaults=record_defaults)

    # Group decoded rows into shuffled batches of batch_size rows.
    return tf.train.shuffle_batch(
        columns,
        batch_size=batch_size,
        capacity=batch_size * 50,
        min_after_dequeue=batch_size,
    )


def inputs():
    """Read a batch of Titanic training rows and build model features and labels.

    Returns:
        A pair ``(features, survived)``: ``features`` has one row per passenger
        with five columns (three class indicators, a female indicator, age) and
        ``survived`` is the survival column reshaped to ``[100, 1]``.
    """
    batch_size = 100
    column_defaults = [[0.0], [0.0], [0], [""], [""], [0.0], [0.0], [0.0], [""], [0.0], [""], [""]]
    columns = read_csv(batch_size, "/home/liuzesen/Downloads/train.csv", column_defaults)

    # Column order: passenger_id, survived, pclass, name, sex, age,
    # sibsp, parch, ticket, fare, cabin, embarked. Only four are used.
    survived = columns[1]
    pclass = columns[2]
    sex = columns[4]
    age = columns[5]

    # One indicator per ticket class, so the model does not treat the class
    # number as a linear quantity.
    class_flags = [tf.to_float(tf.equal(pclass, [level])) for level in (1, 2, 3)]

    # 1.0 for female passengers, 0.0 otherwise.
    is_female = tf.to_float(tf.equal(sex, ["female"]))

    # Stack the per-feature vectors, then transpose so each row is one passenger.
    stacked = tf.stack(class_flags + [is_female, age])
    features = tf.transpose(stacked)
    # Reshape the survival column into a [batch_size, 1] matrix.
    survived = tf.reshape(survived, [batch_size, 1])

    return features, survived


def train(total_loss):
    """Build the gradient-descent training op that minimizes ``total_loss``.

    Args:
        total_loss: Scalar loss tensor to minimize.

    Returns:
        The op produced by ``GradientDescentOptimizer.minimize``.
    """
    step_size = 0.01
    optimizer = tf.train.GradientDescentOptimizer(step_size)
    return optimizer.minimize(total_loss)


def evaluate(sess, X, Y):
    """Print the fraction of samples in ``X`` whose prediction matches ``Y``.

    A sample is predicted positive when the model output exceeds 0.5. The
    calling script passes the training input tensors, so this measures accuracy
    on training data rather than on a held-out set.
    """
    probabilities = inference(X)
    predicted = tf.cast(tf.greater(probabilities, 0.5), tf.float32)
    hits = tf.cast(tf.equal(predicted, Y), tf.float32)
    accuracy = tf.reduce_mean(hits)
    print(sess.run(accuracy))


# Train the logistic regression model on the Titanic data and print its accuracy.
import time

with tf.Session() as sess:
    tf.global_variables_initializer().run()

    X, Y = inputs()

    total_loss = loss(X, Y)
    train_op = train(total_loss)

    # The CSV input pipeline is queue-based: start its background threads.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    try:
        training_steps = 1000
        for step in range(training_steps):
            sess.run([train_op])
            # Report the loss every 10 steps to watch training progress.
            if step % 10 == 0:
                print("loss: ", sess.run([total_loss]))

        evaluate(sess, X, Y)

        # NOTE(review): purpose of this pause is not evident from the code;
        # kept to preserve the original behavior.
        time.sleep(5)
    finally:
        # Always stop and join the queue-runner threads, even if training
        # raised, so they do not outlive the session.
        coord.request_stop()
        coord.join(threads)
    # No explicit sess.close(): the enclosing `with` block closes the session.

归一化指数分类

分享