输入名字识别男女开发全过程附实例

简述:

基于概率论的分类方法 :朴素贝叶斯

通过分析7000个人的名字,分别计算男女名字中含有某个字的概率,

如果男孩子名字的概率大于女孩子的,则判断结果为男孩子。

算法部分

从数据库中读取名字和性别

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# Read names and genders from the database.
# The training set is capped at 7000 rows.
def input_name():
    """Load the training names and group their characters by gender.

    Reads up to 7000 rows from the ``student`` table. For every row the
    slice ``row[1][1:3]`` is taken (the first character is dropped and at
    most the next two are kept) and appended to the male string when
    ``row[2] == '男'``, otherwise to the female string.

    NOTE(review): assumes the columns are (id, name, gender) and that the
    surname is exactly one character -- confirm against the table schema.

    Returns:
        dict: ``{'男': <all male characters>, '女': <all other characters>}``.
    """
    conn = pymysql.connect(host='localhost', port=3306, user='root', db='person_name', charset='utf8')
    try:
        # The cursor is closed by the ``with`` block, the connection by
        # ``finally``, so nothing leaks when the query raises.
        with conn.cursor() as cursor:
            cursor.execute("SELECT * FROM student")
            names_data = cursor.fetchmany(7000)
    finally:
        conn.close()
    # No commit is issued: this function only reads.

    # Collect the pieces and join once instead of repeated concatenation.
    boy_parts = []
    girl_parts = []
    for row in names_data:
        given_name = row[1][1:3]
        if row[2] == '男':
            boy_parts.append(given_name)
        else:
            girl_parts.append(given_name)
    return {
        '男': ''.join(boy_parts),
        '女': ''.join(girl_parts),
    }

计算字符概率

1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Compute per-character probabilities.
def probability(words, total=7000):
    """Return the relative frequency of every character in *words*.

    Args:
        words: Iterable of characters (typically one long string of
            concatenated given names).
        total: Denominator applied to every count. Defaults to 7000, the
            value that was previously hard-coded; pass the real number of
            training samples when it differs.

    Returns:
        dict: Maps each distinct character to ``count / total``, in order
        of first appearance.

    Raises:
        ValueError: If *total* is not positive.
    """
    if total <= 0:
        raise ValueError("total must be a positive number")
    # Count the occurrences of each character.
    counts = {}
    for ch in words:
        counts[ch] = counts.get(ch, 0) + 1
    # Turn the counts into probabilities.
    return {ch: count / total for ch, count in counts.items()}

概率数据存入数据库

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# Store the probability data in the database.
def save_mysql(word_data, sex):
    """Insert every ``(word, probability)`` pair into the table for *sex*.

    Args:
        word_data: Mapping of character -> probability.
        sex: ``'男'`` writes to the ``boy`` table; any other value writes
            to the ``girl`` table.
    """
    if sex == '男':
        sql = "INSERT INTO boy (word,probability) VALUES (%s,%s)"
    else:
        sql = "INSERT INTO girl (word,probability) VALUES (%s,%s)"

    conn = pymysql.connect(host='localhost', port=3306, user='root', db='person_name', charset='utf8')
    try:
        with conn.cursor() as cursor:
            # One batched call instead of one execute() per row.
            cursor.executemany(sql, list(word_data.items()))
        # Commit, otherwise the inserted rows are not persisted.
        conn.commit()
    finally:
        # Always release the connection, even when an insert fails.
        conn.close()

训练部分完整代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import pymysql


# Compute per-character probabilities.
def probability(words, total=7000):
    """Return the relative frequency of every character in *words*.

    Args:
        words: Iterable of characters (typically one long string of
            concatenated given names).
        total: Denominator applied to every count. Defaults to 7000, the
            value that was previously hard-coded; pass the real number of
            training samples when it differs.

    Returns:
        dict: Maps each distinct character to ``count / total``, in order
        of first appearance.

    Raises:
        ValueError: If *total* is not positive.
    """
    if total <= 0:
        raise ValueError("total must be a positive number")
    # Count the occurrences of each character.
    counts = {}
    for ch in words:
        counts[ch] = counts.get(ch, 0) + 1
    # Turn the counts into probabilities.
    return {ch: count / total for ch, count in counts.items()}


# Read names and genders from the database.
# The training set is capped at 7000 rows.
def input_name():
    """Load the training names and group their characters by gender.

    Reads up to 7000 rows from the ``student`` table. For every row the
    slice ``row[1][1:3]`` is taken (the first character is dropped and at
    most the next two are kept) and appended to the male string when
    ``row[2] == '男'``, otherwise to the female string.

    NOTE(review): assumes the columns are (id, name, gender) and that the
    surname is exactly one character -- confirm against the table schema.

    Returns:
        dict: ``{'男': <all male characters>, '女': <all other characters>}``.
    """
    conn = pymysql.connect(host='localhost', port=3306, user='root', db='person_name', charset='utf8')
    try:
        # The cursor is closed by the ``with`` block, the connection by
        # ``finally``, so nothing leaks when the query raises.
        with conn.cursor() as cursor:
            cursor.execute("SELECT * FROM student")
            names_data = cursor.fetchmany(7000)
    finally:
        conn.close()
    # No commit is issued: this function only reads.

    # Collect the pieces and join once instead of repeated concatenation.
    boy_parts = []
    girl_parts = []
    for row in names_data:
        given_name = row[1][1:3]
        if row[2] == '男':
            boy_parts.append(given_name)
        else:
            girl_parts.append(given_name)
    return {
        '男': ''.join(boy_parts),
        '女': ''.join(girl_parts),
    }


# Store the probability data in the database.
def save_mysql(word_data, sex):
    """Insert every ``(word, probability)`` pair into the table for *sex*.

    Args:
        word_data: Mapping of character -> probability.
        sex: ``'男'`` writes to the ``boy`` table; any other value writes
            to the ``girl`` table.
    """
    if sex == '男':
        sql = "INSERT INTO boy (word,probability) VALUES (%s,%s)"
    else:
        sql = "INSERT INTO girl (word,probability) VALUES (%s,%s)"

    conn = pymysql.connect(host='localhost', port=3306, user='root', db='person_name', charset='utf8')
    try:
        with conn.cursor() as cursor:
            # One batched call instead of one execute() per row.
            cursor.executemany(sql, list(word_data.items()))
        # Commit, otherwise the inserted rows are not persisted.
        conn.commit()
    finally:
        # Always release the connection, even when an insert fails.
        conn.close()


if __name__ == '__main__':
    # Training entry point: load names, compute probabilities, persist them.
    name_data = input_name()
    # Per-character probabilities, computed for boys first and then girls.
    boy_probability, girl_probability = (
        probability(name_data[sex]) for sex in ('男', '女')
    )
    # Write each gender's probabilities to its own table.
    for sex, word_table in (('男', boy_probability), ('女', girl_probability)):
        save_mysql(word_table, sex)

应用部分完整代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import pymysql


# Read the stored probabilities from the database.
def accept_mysql():
    """Fetch every row of the ``boy`` and ``girl`` tables.

    Returns:
        dict: ``{'男': <rows of boy>, '女': <rows of girl>}``, where each
        value is whatever ``cursor.fetchall()`` returns for that table.
    """
    conn = pymysql.connect(host='localhost', port=3306, user='root', db='person_name', charset='utf8')
    try:
        # Boys' data.
        with conn.cursor() as boy_cursor:
            boy_cursor.execute("SELECT * FROM boy")
            boy_data = boy_cursor.fetchall()
        # Girls' data.
        with conn.cursor() as girl_cursor:
            girl_cursor.execute("SELECT * FROM girl")
            girl_data = girl_cursor.fetchall()
    finally:
        # Previously the connection and both cursors were never closed.
        conn.close()
    return {
        '男': boy_data,
        '女': girl_data,
    }


def _rows_by_word(rows):
    """Group table rows by their character (``row[1]``), preserving row order."""
    index = {}
    for row in rows:
        index.setdefault(row[1], []).append(row)
    return index


def _score_name(words, index):
    """Add up ``row[2]`` of every indexed row matching each character of *words*."""
    score = 0
    for ch in words:
        for row in index.get(ch, ()):
            score = score + row[2]
    return score


# Score a given name against the stored probabilities.
def read_name(words, mysql_data_boy, mysql_data_girl):
    """Sum the stored probabilities of each character in *words* per gender.

    Args:
        words: Characters of the given name (surname already removed).
        mysql_data_boy: Rows of the ``boy`` table; ``row[1]`` is the
            character and ``row[2]`` its probability.
        mysql_data_girl: Rows of the ``girl`` table, same layout.

    Returns:
        dict: ``{'男': <boy score>, '女': <girl score>}``. A score stays at
        the integer ``0`` when no character of *words* is in that table.

    Note:
        For names of two or more characters the probabilities are added
        (not multiplied). Each table is indexed once, so the tables are no
        longer rescanned for every character of the name.
    """
    return {
        '男': _score_name(words, _rows_by_word(mysql_data_boy)),
        '女': _score_name(words, _rows_by_word(mysql_data_girl)),
    }


if __name__ == '__main__':
    # Application entry point: read a full name from stdin and print the guess.
    raw_name = input()
    # Drop the first character, which is taken to be the surname.
    # NOTE(review): compound (two-character) surnames are not handled.
    name = raw_name[1:]
    # Load both tables with a single database round trip.
    mysql_data = accept_mysql()
    mysql_data_boy = mysql_data['男']
    mysql_data_girl = mysql_data['女']
    # Score the name once and read both results from it.
    scores = read_name(name, mysql_data_boy, mysql_data_girl)
    boy_probability = scores['男']
    girl_probability = scores['女']
    if boy_probability > girl_probability:
        print('男孩子')
    elif girl_probability > boy_probability:
        print("女孩子")
    else:
        # Equal scores, e.g. when none of the characters were seen in training.
        print("不可能!")

DEMO

项目DEMO地址:http://boy-girl.netlab.sunan.me

爬虫太多,服务已关闭
2020.03.21