ML

输入名字识别男女开发全过程附实例

简述:

基于概率论的分类方法 :朴素贝叶斯

通过分析7000个人的名字,分别计算男女名字中含有某个字的概率,

如果名字中的字在男孩子名字中出现的概率大于在女孩子名字中出现的概率,则判断结果为男孩子,反之则判断为女孩子。

算法部分

从数据库中读取名字和性别

# Read names and genders from the database.
# The training set is limited to 7000 rows.
def input_name():
    """Load the training names and group their characters by gender.

    Connects to the local ``person_name`` MySQL database, fetches up to
    7000 rows from the ``student`` table and, for each row, takes the
    slice ``[1:3]`` of column 1 (the name with its first character
    dropped, at most two characters kept). The slices are concatenated
    into one string per gender.

    Returns:
        dict: ``{'男': str, '女': str}``. Rows whose column 2 equals
        ``'男'`` feed the first string; every other row feeds the second.
    """
    conn = pymysql.connect(host='localhost', port=3306, user='root', db='person_name', charset='utf8')
    try:
        # The cursor is closed by the ``with`` block, the connection by
        # ``finally``, so neither leaks if the query raises.
        with conn.cursor() as cursor:
            cursor.execute("SELECT * FROM student")
            names_data = cursor.fetchmany(7000)
    finally:
        # Read-only query: there is nothing to commit before closing.
        conn.close()

    # Collect the pieces in lists and join once; repeated ``str + str``
    # copies the growing string on every row.
    boy_parts = []
    girl_parts = []
    for row in names_data:
        # NOTE(review): relies on SELECT * column order -- presumably
        # (id, name, gender); confirm against the table schema.
        given_name = row[1][1:3]
        if row[2] == '男':
            boy_parts.append(given_name)
        else:
            girl_parts.append(given_name)

    return {
        '男': ''.join(boy_parts),
        '女': ''.join(girl_parts),
    }

计算字符概率

# Compute per-character probabilities.
def probability(words, total=7000):
    """Return the relative frequency of every character in *words*.

    Args:
        words: iterable of characters (typically one long string).
        total: divisor applied to each character count. Defaults to
            7000, the size of the training set used by this script;
            pass the real sample count when training on a different
            amount of data.

    Returns:
        dict: character -> ``count / total``. Empty when *words* is empty.

    Raises:
        ValueError: if *total* is not positive.
    """
    if total <= 0:
        raise ValueError("total must be a positive number")
    # Count occurrences of each character.
    counts = {}
    for ch in words:
        counts[ch] = counts.get(ch, 0) + 1
    # Turn the counts into frequencies.
    return {ch: count / total for ch, count in counts.items()}

概率数据存入数据库

# Store the probability data in the database.
def save_mysql(word_data, sex):
    """Insert per-character probabilities into the table for *sex*.

    Args:
        word_data: mapping of character -> probability.
        sex: ``'男'`` writes to table ``boy``; any other value writes
            to table ``girl``.
    """
    # Both statements are fixed literals; only the row values are
    # passed as query parameters.
    if sex == '男':
        sql = "INSERT INTO boy (word,probability) VALUES (%s,%s)"
    else:
        sql = "INSERT INTO girl (word,probability) VALUES (%s,%s)"

    conn = pymysql.connect(host='localhost', port=3306, user='root', db='person_name', charset='utf8')
    try:
        with conn.cursor() as cursor:
            # One batched call instead of one execute() per character.
            cursor.executemany(sql, list(word_data.items()))
        # Commit, otherwise the inserted rows are not persisted.
        conn.commit()
    finally:
        # Close the connection even when an insert fails.
        conn.close()

训练部分完整代码

import pymysql

# Compute per-character probabilities.
def probability(words, total=7000):
    """Return the relative frequency of every character in *words*.

    Args:
        words: iterable of characters (typically one long string).
        total: divisor applied to each character count. Defaults to
            7000, the size of the training set used by this script;
            pass the real sample count when training on a different
            amount of data.

    Returns:
        dict: character -> ``count / total``. Empty when *words* is empty.

    Raises:
        ValueError: if *total* is not positive.
    """
    if total <= 0:
        raise ValueError("total must be a positive number")
    # Count occurrences of each character.
    counts = {}
    for ch in words:
        counts[ch] = counts.get(ch, 0) + 1
    # Turn the counts into frequencies.
    return {ch: count / total for ch, count in counts.items()}

# Read names and genders from the database.
# The training set is limited to 7000 rows.
def input_name():
    """Load the training names and group their characters by gender.

    Connects to the local ``person_name`` MySQL database, fetches up to
    7000 rows from the ``student`` table and, for each row, takes the
    slice ``[1:3]`` of column 1 (the name with its first character
    dropped, at most two characters kept). The slices are concatenated
    into one string per gender.

    Returns:
        dict: ``{'男': str, '女': str}``. Rows whose column 2 equals
        ``'男'`` feed the first string; every other row feeds the second.
    """
    conn = pymysql.connect(host='localhost', port=3306, user='root', db='person_name', charset='utf8')
    try:
        # The cursor is closed by the ``with`` block, the connection by
        # ``finally``, so neither leaks if the query raises.
        with conn.cursor() as cursor:
            cursor.execute("SELECT * FROM student")
            names_data = cursor.fetchmany(7000)
    finally:
        # Read-only query: there is nothing to commit before closing.
        conn.close()

    # Collect the pieces in lists and join once; repeated ``str + str``
    # copies the growing string on every row.
    boy_parts = []
    girl_parts = []
    for row in names_data:
        # NOTE(review): relies on SELECT * column order -- presumably
        # (id, name, gender); confirm against the table schema.
        given_name = row[1][1:3]
        if row[2] == '男':
            boy_parts.append(given_name)
        else:
            girl_parts.append(given_name)

    return {
        '男': ''.join(boy_parts),
        '女': ''.join(girl_parts),
    }

# Store the probability data in the database.
def save_mysql(word_data, sex):
    """Insert per-character probabilities into the table for *sex*.

    Args:
        word_data: mapping of character -> probability.
        sex: ``'男'`` writes to table ``boy``; any other value writes
            to table ``girl``.
    """
    # Both statements are fixed literals; only the row values are
    # passed as query parameters.
    if sex == '男':
        sql = "INSERT INTO boy (word,probability) VALUES (%s,%s)"
    else:
        sql = "INSERT INTO girl (word,probability) VALUES (%s,%s)"

    conn = pymysql.connect(host='localhost', port=3306, user='root', db='person_name', charset='utf8')
    try:
        with conn.cursor() as cursor:
            # One batched call instead of one execute() per character.
            cursor.executemany(sql, list(word_data.items()))
        # Commit, otherwise the inserted rows are not persisted.
        conn.commit()
    finally:
        # Close the connection even when an insert fails.
        conn.close()

if __name__ == '__main__':
    # Load the training characters, grouped by gender.
    training = input_name()
    # Compute the character frequencies for both genders first
    # (male, then female) ...
    tables = [(sex, probability(training[sex])) for sex in ('男', '女')]
    # ... then write each table to the database in the same order.
    for sex, table in tables:
        save_mysql(table, sex)

应用部分完整代码

import pymysql

# Read the stored probability tables from the database.
def accept_mysql():
    """Fetch every row of the ``boy`` and ``girl`` tables.

    Returns:
        dict: ``{'男': rows of table boy, '女': rows of table girl}``,
        each value being the result of ``cursor.fetchall()``.
    """
    conn = pymysql.connect(host='localhost', port=3306, user='root', db='person_name', charset='utf8')
    try:
        # One cursor serves both queries: the first result set is fully
        # fetched before the second statement is executed.
        with conn.cursor() as cursor:
            cursor.execute("SELECT * FROM boy")
            boy_data = cursor.fetchall()
            cursor.execute("SELECT * FROM girl")
            girl_data = cursor.fetchall()
    finally:
        # Release the connection; it used to be left open.
        conn.close()
    return {
        '男': boy_data,
        '女': girl_data,
    }

# Score a name against the stored probability tables.
def read_name(words, mysql_data_boy, mysql_data_girl):
    """Sum the stored probabilities of each character of *words*.

    Args:
        words: the characters to score (the name without its surname).
        mysql_data_boy: rows of the male table; index 1 of a row is
            compared with each character and index 2 is added on a match.
        mysql_data_girl: rows of the female table, same layout.

    Returns:
        dict: ``{'男': male total, '女': female total}``; a total stays
        0 when no character matches.
    """
    chars = tuple(words)

    def total_for(rows):
        # Every matching row contributes, once per occurrence of the
        # character in the name; multi-character names add up.
        score = 0
        for ch in chars:
            for row in rows:
                if ch == row[1]:
                    score = score + row[2]
        return score

    return {
        '男': total_for(mysql_data_boy),
        '女': total_for(mysql_data_girl),
    }

if __name__ == '__main__':
    # Strip surrounding whitespace so the slice below really removes
    # the first character of the name rather than a stray space.
    raw_name = input().strip()
    # Drop the first character, which is treated as the surname.
    name = raw_name[1:]
    # Query the database once and reuse the result for both tables.
    mysql_data = accept_mysql()
    # Score the name once; the result holds both totals.
    scores = read_name(name, mysql_data['男'], mysql_data['女'])
    boy_probability = scores['男']
    girl_probability = scores['女']
    # The three outcomes are mutually exclusive.
    if boy_probability > girl_probability:
        print('男孩子')
    elif girl_probability > boy_probability:
        print("女孩子")
    else:
        # Tie, including the case where no character is known (0 == 0).
        print("不可能!")

DEMO

项目DEMO地址:http://boy-girl.netlab.sunan.me

爬虫太多,服务已关闭
2020.03.21

陌风同学
微信公众号:MFlow 知乎:陌风小同学
查看“陌风同学”的所有文章 →

发表评论

您的电子邮箱地址不会被公开。 必填项已用*标注

相关推荐


Copyright 2015-2021 陌风同学 All Rights Reserved. 页面生成时间:5.568 秒.
 ICP证:鲁ICP备15022835号-1