7 Decision Trees in Python
Decision Trees
In which we talk about decision trees!
import numpy as np
import pandas
import matplotlib.pyplot as plt
%matplotlib inline
import os
from sklearn.model_selection import train_test_split
from sklearn import datasets
##New!
from sklearn import tree
#os.getcwd() ## run if things aren't working as expected!
import sklearn as sklearn
sklearn.__version__
'1.2.1'
## load our labeled dataset
## os.path.join builds the path with the correct separator on every OS.
## A plain "data\..." literal only resolves on Windows and leans on "\H"
## not being a recognised escape sequence (newer Pythons warn about that).
heloc = pandas.read_csv(os.path.join("data", "Heloc_labeled.csv"))
## look it over
heloc.head(10)
#heloc.shape
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Age | Sex | Income | HELOC | |
---|---|---|---|---|
0 | 30 | Female | 101000 | 0 |
1 | 25 | Male | 86000 | 0 |
2 | 20 | Male | 50000 | 0 |
3 | 26 | Male | 58000 | 0 |
4 | 18 | Female | 93000 | 0 |
5 | 38 | Male | 153000 | 0 |
6 | 61 | Male | 71000 | 1 |
7 | 27 | Male | 102000 | 0 |
8 | 38 | Male | 33000 | 1 |
9 | 42 | Female | 69000 | 0 |
## how many columns did we load?
len(heloc.columns)
4
## The Sex column holds text ("Male"/"Female"), which is going to be a problem:
## scikit-learn's trees need numeric features, so our data needs preprocessing.
## There are lots of ways to do this!
def sex_recode(sex):
    """
    Recode one value of the Sex column as an integer.

    Parameters
    ----------
    sex : object
        A single cell value. It is compared exactly (case-sensitive)
        against "Female" and "Male".

    Returns
    -------
    int
        1 for "Female", 0 for "Male", and 2 for anything else.
    """
    if sex == "Female":
        return 1
    if sex == "Male":
        return 0
    # Catch-all bucket: typos, other labels and missing values all land here.
    return 2
sex_recode("amy") ## Test: a value that is neither "Female" nor "Male" should give 2
2
## Swap the text labels in Sex for their numeric codes, then peek at the result.
## NOTE: run this cell only once per load. sex_recode sends anything that is
## not "Female"/"Male" to 2, so recoding the already-numeric column gives all 2s.
heloc["Sex"] = heloc["Sex"].apply(sex_recode)
heloc.head(n=5)
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Age | Sex | Income | HELOC | |
---|---|---|---|---|
0 | 30 | 1 | 101000 | 0 |
1 | 25 | 0 | 86000 | 0 |
2 | 20 | 0 | 50000 | 0 |
3 | 26 | 0 | 58000 | 0 |
4 | 18 | 1 | 93000 | 0 |
## Separate the predictors (X) from the target (Y):
X = heloc.loc[:, ['Age', 'Sex', 'Income']]
## the list selector keeps Y as a one-column DataFrame rather than a Series
Y = heloc.loc[:, ['HELOC']]
Y.tail(5)
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
HELOC | |
---|---|
495 | 1 |
496 | 1 |
497 | 0 |
498 | 0 |
499 | 1 |
## Build the model: a shallow (depth-2) tree with a fixed seed so the fit is
## repeatable. fit() hands back the fitted estimator, so it can be chained.
tree_heloc = tree.DecisionTreeClassifier(
    max_depth=2,
    random_state=2021,
).fit(X, Y)
## Draw the fitted tree; the trailing semicolon hides the echoed return value.
tree.plot_tree(
    tree_heloc,
    feature_names=['Age', 'Sex', 'Income'],
    filled=True,
);
?tree.DecisionTreeClassifier
## Let's predict on our data!
## os.path.join builds the path with the correct separator on every OS.
## A plain "data\..." literal only resolves on Windows and leans on "\H"
## not being a recognised escape sequence (newer Pythons warn about that).
heloc_new = pandas.read_csv(os.path.join("data", "Heloc_unlabeled.csv"))
## fix that pesky Sex variable!
heloc_new['Sex'] = heloc_new['Sex'].apply(sex_recode)
## look it over
heloc_new.head(3)
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Age | Sex | Income | |
---|---|---|---|
0 | 25 | 1 | 45000 |
1 | 23 | 0 | 22000 |
2 | 50 | 1 | 17000 |
## Trying our model, and making a new set:
## Hand the tree exactly the columns it was trained on, in training order,
## so this cell keeps working even if heloc_new carries extra columns.
Y_new = tree_heloc.predict(heloc_new[['Age', 'Sex', 'Income']])
## I like to make a new data.frame:
## .copy() is what actually makes it new. Plain assignment would only alias
## heloc_new, so the prediction column would be written into heloc_new too.
heloc_p = heloc_new.copy()
heloc_p['Heloc'] = Y_new  ## add in the new column of predictions
## view!
heloc_p
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Age | Sex | Income | Heloc | |
---|---|---|---|---|
0 | 25 | 1 | 45000 | 0 |
1 | 23 | 0 | 22000 | 0 |
2 | 50 | 1 | 17000 | 0 |
3 | 28 | 1 | 38000 | 0 |
4 | 56 | 0 | 24000 | 1 |
5 | 50 | 1 | 24000 | 0 |
6 | 22 | 0 | 27000 | 0 |
7 | 22 | 0 | 40000 | 0 |
8 | 21 | 0 | 41000 | 0 |
9 | 29 | 1 | 45000 | 0 |
10 | 64 | 0 | 47000 | 1 |
11 | 47 | 1 | 48000 | 0 |
12 | 55 | 1 | 49000 | 0 |
13 | 25 | 0 | 54000 | 1 |
14 | 42 | 0 | 113000 | 1 |
15 | 40 | 1 | 118000 | 0 |
16 | 38 | 0 | 153000 | 1 |
17 | 63 | 0 | 156000 | 1 |
18 | 51 | 1 | 172000 | 0 |
19 | 51 | 0 | 43000 | 1 |