泰坦尼克号生存预测

举报
Nikolas 发表于 2020/12/27 18:37:07 2020/12/27
【摘要】 使用sklearn进行泰坦尼克号生存预测

## 1.导入需要的包


```python
import warnings
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier as XGBC
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
```

## 2.设置自定义参数


```python
# Silence all warnings. The DeprecationWarning-specific filter is registered
# as well, exactly as before, although the blanket filter already covers it.
warnings.filterwarnings(action='ignore')
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

# Plot settings: select the 'simHei' sans-serif font (presumably so Chinese
# labels render -- confirm the font is installed) and turn off the Unicode
# minus sign on axes.
plt.rcParams.update({
    'font.sans-serif': ['simHei'],
    'axes.unicode_minus': False,
})
```

## 3.定义训练模型


```python
class titanic():
    """Titanic survival experiment: feature engineering plus model comparison.

    Loads the train/test CSV files, converts the raw passenger columns into
    numeric features and prints the mean cross-validation score of several
    classifiers.
    """

    # Numeric encoding applied to the ``Sex`` column.
    sex_map = {'male': 1, 'female': 0}

    # Collapses the honorific parsed out of ``Name`` into a few coarse groups.
    title_map = {
        "Capt": "Officer",
        "Col": "Officer",
        "Major": "Officer",
        "Jonkheer": "Royalty",
        "Don": "Royalty",
        "Sir": "Royalty",
        "Dr": "Officer",
        "Rev": "Officer",
        "the Countess": "Royalty",
        "Dona": "Royalty",
        "Mme": "Mrs",
        "Mlle": "Miss",
        "Ms": "Mrs",
        "Mr": "Mr",
        "Mrs": "Mrs",
        "Miss": "Miss",
        "Master": "Master",
        "Lady": "Royalty"
    }

    def __init__(self):
        """Load both CSV files from ``./data`` and drop ``PassengerId`` from the training frame."""
        self.train = pd.read_csv('./data/titanic_train.csv')
        self.test = pd.read_csv('./data/titanic_test.csv')
        self.train.drop('PassengerId', axis=1, inplace=True)

    def _one_hot(self, data, column, prefix):
        """Return ``data`` with ``column`` replaced by its one-hot columns (appended at the end)."""
        dummies = pd.get_dummies(data[column], prefix=prefix)
        return pd.concat([data, dummies], axis=1).drop(column, axis=1)

    def process(self, data):
        """Turn a raw passenger frame into a numeric feature frame.

        The input frame is copied first, so the caller's frame is not
        modified and the method can safely be called more than once.

        Steps:
          * fill missing ``Age`` (median), ``Fare`` (mean), ``Embarked``
            (most frequent value) and ``Cabin`` (``'U'`` for unknown);
          * encode ``Sex`` with ``sex_map``;
          * one-hot encode ``Embarked``, ``Pclass``, the grouped title taken
            from ``Name``, the first letter of ``Cabin`` and the ticket prefix;
          * add ``FamilySize`` and the indicator columns ``Family_Single``
            (size 1), ``Family_Small`` (2-4) and ``Family_Large`` (5+).

        Args:
            data: DataFrame containing the columns ``Age``, ``Fare``,
                ``Embarked``, ``Cabin``, ``Sex``, ``Pclass``, ``Name``,
                ``Parch``, ``SibSp`` and ``Ticket``.

        Returns:
            A new DataFrame; every other input column is kept unchanged and in
            its original position, with the generated columns appended.
        """
        data = data.copy()

        # Assign the filled column back instead of a chained
        # ``fillna(inplace=True)``, which is deprecated and silently does
        # nothing under pandas Copy-on-Write.
        data['Age'] = data['Age'].fillna(data['Age'].median())
        data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
        # ``mode()`` returns a Series; passing it to ``fillna`` would align on
        # the index and only ever fill row 0, so take the scalar value.
        embarked_mode = data['Embarked'].mode()
        if not embarked_mode.empty:
            data['Embarked'] = data['Embarked'].fillna(embarked_mode.iloc[0])
        data['Cabin'] = data['Cabin'].fillna('U')

        data['Sex'] = data['Sex'].map(self.sex_map)

        data = self._one_hot(data, 'Embarked', 'Embarked')
        data = self._one_hot(data, 'Pclass', 'Pclass')

        # Titles missing from ``title_map`` become NaN and therefore get no
        # indicator column set.
        titles = data['Name'].map(self.get_title).map(self.title_map)
        data = pd.concat([data, pd.get_dummies(titles, prefix='Title')], axis=1)
        data = data.drop('Name', axis=1)

        # Only the first character of the cabin value is used.
        data['Cabin'] = data['Cabin'].map(lambda c: c[0])
        data = self._one_hot(data, 'Cabin', 'Cabin')

        # Bucket the family size. Each bucket needs its own column; writing
        # all three to ``Family_Single`` kept only the last one.
        family_size = data['Parch'] + data['SibSp'] + 1
        family = pd.DataFrame({
            'FamilySize': family_size,
            'Family_Single': (family_size == 1).astype(int),
            'Family_Small': family_size.between(2, 4).astype(int),
            'Family_Large': (family_size >= 5).astype(int),
        })
        data = pd.concat([data, family], axis=1)

        data['Ticket'] = data['Ticket'].map(self.clean_ticket)
        data = self._one_hot(data, 'Ticket', 'Ticket')
        return data

    def get_title(self, name):
        """Extract the title from a name: the text between the first comma and the following period."""
        after_comma = name.split(',')[1]
        return after_comma.split('.')[0].strip()

    def clean_ticket(self, ticket):
        """Return the ticket's first non-numeric token, or ``'N'`` if every token is numeric.

        Periods and slashes are removed before the ticket is split on
        whitespace, so ``'A/5 21171'`` yields ``'A5'``.
        """
        tokens = ticket.replace('.', '').replace('/', '').split()
        prefixes = [token for token in tokens if not token.isdigit()]
        return prefixes[0] if prefixes else 'N'

    def _cv_report(self, label, model, x, y, cv=None):
        """Cross-validate ``model``, print ``label`` with the mean score and return that score.

        ``cv=None`` leaves the fold count to ``cross_val_score``'s default.
        """
        score = cross_val_score(model, x, y, cv=cv).mean()
        print(label, score)
        return score

    def Logistic(self, x, y):
        """Logistic regression, 10-fold cross-validation; prints and returns the mean score."""
        return self._cv_report('Logistic:', LogisticRegression(), x, y, cv=10)

    def DecisionTree(self, x, y):
        """Decision tree (``random_state=0``), 10-fold cross-validation; prints and returns the mean score."""
        return self._cv_report('DecisionTree:', DecisionTreeClassifier(random_state=0), x, y, cv=10)

    def RandomForest(self, x, y):
        """Random forest with 25 trees, 10-fold cross-validation; prints and returns the mean score."""
        return self._cv_report('RandomForest:', RandomForestClassifier(n_estimators=25), x, y, cv=10)

    def SVC(self, x, y):
        """Support vector classifier, 5-fold cross-validation for each of four kernels.

        Prints one line per kernel and returns a dict mapping kernel name to
        its mean score.
        """
        scores = {}
        for kernel in ('linear', 'poly', 'rbf', 'sigmoid'):
            # Inside the method body ``SVC`` resolves to the module-level
            # sklearn class, not to this method.
            clf = SVC(kernel=kernel, gamma='auto', degree=1, cache_size=5000)
            scores[kernel] = self._cv_report('SVC {}:'.format(kernel), clf, x, y, cv=5)
        return scores

    def XGB(self, x, y):
        """XGBoost classifier with 100 estimators, 10-fold cross-validation; prints and returns the mean score."""
        return self._cv_report('XGB:', XGBC(n_estimators=100), x, y, cv=10)

    def Naive_bayes(self, x, y):
        """Bernoulli naive Bayes with the default fold count; prints and returns the mean score."""
        return self._cv_report('Naive_bayes:', BernoulliNB(), x, y)

    def GBDT(self, x, y):
        """Gradient-boosted trees with the default fold count; prints and returns the mean score."""
        return self._cv_report('GBDT:', GradientBoostingClassifier(), x, y)

    def KNN(self, x, y):
        """K-nearest neighbours with the default fold count; prints and returns the mean score."""
        return self._cv_report('KNN:', KNeighborsClassifier(), x, y)

    def main(self):
        """Preprocess the training data, standardise the features and evaluate every model."""
        data = self.process(self.train)

        # The first remaining column is used as the target and all others as
        # features -- presumably ``Survived`` comes first once ``PassengerId``
        # is dropped; verify against the CSV layout.
        x = data.iloc[:, 1:]
        y = data.iloc[:, 0]
        x = StandardScaler().fit_transform(x)
        self.Logistic(x, y)
        self.DecisionTree(x, y)
        self.RandomForest(x, y)
        self.SVC(x, y)
        self.XGB(x, y)
        self.Naive_bayes(x, y)
        self.GBDT(x, y)
        self.KNN(x, y)


# Script entry point: build the experiment object and run the full model comparison.
if __name__ == "__main__":
    experiment = titanic()
    experiment.main()
```

【声明】本内容来自华为云开发者社区博主,不代表华为云及华为云开发者社区的观点和立场。转载时必须标注文章的来源(华为云社区)、文章链接、文章作者等基本信息,否则作者和本社区有权追究责任。如果您发现本社区中有涉嫌抄袭的内容,欢迎发送邮件进行举报,并提供相关证据,一经查实,本社区将立刻删除涉嫌侵权内容,举报邮箱: cloudbbs@huaweicloud.com
  • 点赞
  • 收藏
  • 关注作者

评论(0)

0/1000
抱歉,系统识别当前为高风险访问,暂不支持该操作

全部回复

上滑加载中

设置昵称

在此一键设置昵称,即可参与社区互动!

*长度不超过10个汉字或20个英文字符,设置后3个月内不可修改。

*长度不超过10个汉字或20个英文字符,设置后3个月内不可修改。