In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
url = 'https://assets.datacamp.com/production/repositories/5952/datasets/6f60d0b204b01be5cb9541b3e4e1dcda02f20868/Databel%20-%20Data.csv'
df = pd.read_csv(url)
print(df.info())
total_customers = len(pd.unique(df['Customer ID']))
print('# of unique Customer ID = ', total_customers)
In [2]:
# create different grouping
# age bins
ranges = [0, 14, 19, 24, 29, 34, 39, 44, 49, 54, 59, 64, 69, 74, 79, 84, np.inf]
age_groups = ['0-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44',
'45-49', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80-84', '85+']
df['Age group'] = pd.cut(df['Age'], bins = ranges, labels=age_groups)
print(df[['Age', 'Age group']].head())
# data usage bins
data_ranges = [-np.inf, 4, 9, np.inf]
data_groups = ['0-4', '5-9', '10+']
df['Data group'] = pd.cut(df['Avg Monthly GB Download'], bins = data_ranges, labels=data_groups)
print(df[['Unlimited Data Plan','Avg Monthly GB Download', 'Data group']].head())
In [3]:
# review data
print(df.describe())
print(df.head())
In [4]:
# review what kind of unique category values in columns
print("Churn Label =",pd.unique(df["Churn Label"]))
print("Intl Active =",pd.unique(df["Intl Active"]))
print("Intl Plan =",pd.unique(df["Intl Plan"]))
print("Group =",pd.unique(df["Group"]))
print("Contract Type =",pd.unique(df["Contract Type"]))
print("Payment Method =",pd.unique(df["Payment Method"]))
print("Churn Category =",pd.unique(df["Churn Category"]))
print("Churn Reason =",pd.unique(df["Churn Reason"]))
In [5]:
# clean up data for consistency
# add Churn column to calculate # of churned customers and churn rate
df['Intl Plan'] = df['Intl Plan'].replace(['no', 'yes'],['No','Yes'])
df['Churn'] = df['Churn Label']
df['Churn'] = df['Churn'].replace(["No", "Yes"],[0, 1])
print('# of Churned customers =', df['Churn'].sum())
print('Total # of Customers = ', total_customers)
print('Churned rate = ', ("{:.2%}".format(df['Churn'].sum() / total_customers)))
In [6]:
# this block of code fixes the issue where some churn customers do not have
# a churn reason or churn category in the original data. i assigned those
# NaN to "No info given" so the total churn customers will be the same regardless
# if the # is calculated based on churn reason / churn category / churn label
# #learned .loc assignment can change the database in place!
condition = (df['Churn Label'] == "Yes") & (df['Churn Category'].isnull())
df.loc[condition, ['Churn Category', 'Churn Reason']] = 'No info given'
In [7]:
# review churn cateogry using groupby
churn_by_category = df.groupby("Churn Category")['Churn'].sum().reset_index()
print(churn_by_category)
fig, ax = plt.subplots(figsize=(6,6))
ax.pie(churn_by_category['Churn'], labels = churn_by_category['Churn Category'], autopct='%1.1f%%')
ax.set_title('Churn Category')
plt.show()
In [8]:
fig, ax = plt.subplots(figsize=(12,12))
print(df['Churn Reason'].value_counts())
churn_by_reason = df.groupby("Churn Reason")['Churn'].sum()
churn_by_reason_sorted = churn_by_reason.sort_values(ascending = True)
print(type(churn_by_reason_sorted))
print(churn_by_reason_sorted.sum())
churn_by_reason_sorted = churn_by_reason_sorted / churn_by_reason_sorted.sum() * 100
churn_by_reason_sorted.plot(kind='barh')
ax.set_xlabel('Percentage of total')
#ax.set_xticklabels(churn_by_reason_sorted.index, rotation = 90)
plt.show()
In [9]:
# this section is to separate out churn and Age group, and add churn rate
# calculate churned customers by age group using pivot table
df_age_group = df.pivot_table(values="Churn", index = "Age group", columns = "Churn Label", aggfunc='sum').reset_index()
df_age_group.columns = ('Age group', 'drop', 'Churn customers')
# df_age_group = df_age_group['Age group', 'Churn customers']
print(df_age_group)
# all customers by age group
customer_by_age_group = df.groupby('Age group')['Customer ID'].count().reset_index()
customer_by_age_group.columns = ('Age group', 'Number of customers')
# avg customer service call by age group
avg_service_call_by_age_group = df.groupby('Age group')['Customer Service Calls'].mean().reset_index()
# avg monthly bill by age group
avg_monthly_bill_by_age_group = df.groupby('Age group')['Monthly Charge'].mean().reset_index()
print(avg_service_call_by_age_group)
print(avg_monthly_bill_by_age_group)
df_age_group = df_age_group.merge(customer_by_age_group, how = 'left', left_on = 'Age group', right_on =
"Age group").merge(avg_service_call_by_age_group, how = 'left', left_on = 'Age group',
right_on = "Age group").merge(avg_monthly_bill_by_age_group, how = 'left', left_on = 'Age group',
right_on = "Age group")
df_age_group = df_age_group[['Age group', 'Churn customers', 'Number of customers', 'Monthly Charge', 'Customer Service Calls']]
print(df_age_group['Number of customers'].sum())
df_age_group['Churn customers'] = df_age_group['Churn customers'].fillna(0)
df_age_group['Churn rate in %'] = df_age_group['Churn customers'] / df_age_group['Number of customers'] * 100
print(df_age_group)
# plot dual axis plot with churn rate on one side and # of customers on the other
fig, ax1 = plt.subplots()
color = 'tab:blue'
ax1.set_xlabel('Age group')
ax1.xaxis.set(ticks=range(0, 16), ticklabels=df_age_group['Age group'])
ax1.tick_params(labelrotation=45)
ax1.set_ylabel('# of customers', color=color)
ax1.bar(df_age_group.index, df_age_group['Number of customers'], color=color)
ax1.tick_params(axis='y', labelcolor=color)
# instantiate a second axes that shares the same x-axis
ax2 = ax1.twinx()
color = 'tab:red'
ax2.set_ylabel('Percentage %', color=color)
ax2.plot(df_age_group.index, df_age_group['Churn rate in %'], color=color)
ax2.tick_params(axis='y', labelcolor=color)
# otherwise the right y-label is slightly clipped
fig.tight_layout()
plt.show()
In [10]:
# plot dual axis plot with churn rate on one side and monthly charge on the other
fig, ax1 = plt.subplots()
color = 'tab:blue'
ax1.set_xlabel('Age group')
ax1.xaxis.set(ticks=range(0, 16), ticklabels=df_age_group['Age group'])
ax1.tick_params(labelrotation=45)
ax1.set_ylabel('Avg. monthly charge $', color=color)
ax1.bar(df_age_group.index, df_age_group['Monthly Charge'], color=color)
ax1.tick_params(axis='y', labelcolor=color)
# instantiate a second axes that shares the same x-axis
ax2 = ax1.twinx()
color = 'tab:red'
ax2.set_ylabel('Percentage %', color=color)
ax2.plot(df_age_group.index, df_age_group['Churn rate in %'], color=color)
ax2.tick_params(axis='y', labelcolor=color)
# otherwise the right y-label is slightly clipped
fig.tight_layout()
plt.show()
In [11]:
# group by senior vs. non-senior
churn_by_senior = df.groupby('Senior')['Churn'].sum()
customer_by_senior = df.groupby('Senior')['Customer ID'].count()
df_senior = pd.concat([churn_by_senior, customer_by_senior], axis = 1).reset_index()
df_senior['Churn rate %'] = df_senior['Churn'] / df_senior['Customer ID'] * 100
df_senior.columns = ['Senior', 'Churn', '# of customers', 'Churn rate %']
df_senior['% of total'] = df_senior['# of customers'] / df_senior['# of customers'].sum() * 100
df_senior = df_senior[['Senior', '# of customers', '% of total', 'Churn', 'Churn rate %']]
print(df_senior)
In [12]:
Group_customer = df.groupby("Number of Customers in Group")['Customer ID'].count()
Group_churn = df.groupby("Number of Customers in Group")['Churn'].sum()
Group_avg_call = df.groupby("Number of Customers in Group")['Customer Service Calls'].mean()
Group_avg_mo_charge = df.groupby("Number of Customers in Group")['Monthly Charge'].mean()
df_Group = pd.concat([Group_customer, Group_churn, Group_avg_call, Group_avg_mo_charge], axis = 1)
df_Group.columns = ["# of customers", "Churn", "Avg. service calls", "Avg. monthly charge"]
df_Group.index = ["No group", "2", "3", "4", "5", "6" ]
df_Group["Churn rate %"] = df_Group["Churn"] / df_Group["# of customers"] * 100
print(df_Group)
# note this is cleaner execution to combine all the info, instead of using using pivot and add groupby columns
fig, ax1 = plt.subplots()
color = 'tab:blue'
ax1.set_xlabel('# of customers in a group')
ax1.xaxis.set(ticks=range(0, 5), ticklabels=df_Group.index)
# ax1.tick_params(labelrotation=45)
ax1.set_ylabel('# of customers', color=color)
ax1.bar(df_Group.index, df_Group['# of customers'], color=color)
ax1.tick_params(axis='y', labelcolor=color)
# instantiate a second axes that shares the same x-axis
ax2 = ax1.twinx()
color = 'tab:red'
ax2.set_ylabel('Churn percentage', color=color)
ax2.plot(df_Group.index, df_Group['Churn rate %'], color=color)
ax2.tick_params(axis='y', labelcolor=color)
# otherwise the right y-label is slightly clipped
#fig.tight_layout()
plt.show()
In [13]:
# service call by # of customers in a group
print(df_Group)
fig, ax1 = plt.subplots()
color = 'tab:blue'
ax1.set_xlabel('# of customers in a group')
ax1.xaxis.set(ticks=range(0, 5), ticklabels=df_Group.index)
# ax1.tick_params(labelrotation=45)
ax1.set_ylabel('Avg. service calls', color=color)
ax1.bar(df_Group.index, df_Group['Avg. service calls'], color=color)
ax1.tick_params(axis='y', labelcolor=color)
# instantiate a second axes that shares the same x-axis
ax2 = ax1.twinx()
color = 'tab:red'
ax2.set_ylabel('Churn percentage %', color=color)
ax2.plot(df_Group.index, df_Group['Churn rate %'], color=color)
ax2.tick_params(axis='y', labelcolor=color)
# otherwise the right y-label is slightly clipped
fig.tight_layout()
plt.show()
In [14]:
# review customer and churn by contract type
print('Customer contract type - \n', df['Contract Type'].value_counts())
customer_by_contract = df.groupby("Contract Type")['Customer ID'].count()
fig = plt.figure(figsize=(12,12))
ax1 = fig.add_subplot(121)
ax1.pie(customer_by_contract, labels = customer_by_contract.index, autopct ='%1.1f%%')
ax1.set(title = 'All customers contract type')
churn_by_contract = df.groupby("Contract Type")['Churn'].sum()
print('Churned customer contract type - \n', churn_by_contract)
ax2 = fig.add_subplot(122)
ax2.pie(churn_by_contract, labels = churn_by_contract.index, autopct='%1.1f%%')
ax2.set(title = 'Churned customers contract type')
plt.show()
In [15]:
#review churn category / reason composition
df_churn_group = pd.DataFrame(df.groupby(['Churn Category','Churn Reason'])['Churn'].sum())
df_churn_group['% of total'] = df_churn_group['Churn'] / df_churn_group['Churn'].sum() * 100
print(df_churn_group)
In [16]:
#customer by state review
state_customer = df.groupby(['State'])['Customer ID'].count()
state_churn = df.groupby(['State'])['Churn'].sum()
df_by_state = pd.concat([state_customer, state_churn], axis = 1)
df_by_state.columns = (['# of customers', 'Churn'])
df_by_state['Churn rate %'] = df_by_state['Churn'] / df_by_state['# of customers'] * 100
df_by_state.sort_values('Churn rate %', inplace=True, ascending=False)
print(df_by_state.head())
df_by_state.sort_values('# of customers', inplace=True, ascending=False)
print(df_by_state.head())
In [17]:
# international plan and international call active review
intl_plan_customer = df.groupby(['Intl Plan', 'Intl Active'])['Customer ID'].count()
intl_plan_churn = df.groupby(['Intl Plan', 'Intl Active'])['Churn'].sum()
intl_plan_ext_charge = df.groupby(['Intl Plan', 'Intl Active'])['Extra International Charges'].mean()
intl_plan_monthly_bill = df.groupby(['Intl Plan', 'Intl Active'])['Monthly Charge'].mean()
df_plan = pd.concat([intl_plan_customer, intl_plan_churn, intl_plan_ext_charge, intl_plan_monthly_bill], axis = 1)
df_plan['Churn rate'] = df_plan['Churn'] / df_plan['Customer ID'] * 100
df_plan.columns = ['# of customers', 'Churn', 'Avg. extra Intl charge', 'Avg. monthly charge', 'Churn rate %']
print(df_plan)
In [18]:
df_plan.loc[:,'Churn rate %'].reset_index()
Out[18]:
In [19]:
# review data usage group in relation to data plan
data_plan_customer = df.groupby(['Data group','Unlimited Data Plan'])['Customer ID'].count()
data_plan_churn = df.groupby(['Data group','Unlimited Data Plan'])['Churn'].sum()
data_plan_data_usage = df.groupby(['Data group','Unlimited Data Plan'])['Avg Monthly GB Download'].mean()
data_plan_ext_charge = df.groupby(['Data group','Unlimited Data Plan'])['Extra Data Charges'].mean()
data_plan_avg_charge = df.groupby(['Data group','Unlimited Data Plan'])['Monthly Charge'].mean()
df_data = pd.concat([data_plan_customer, data_plan_churn, data_plan_data_usage, data_plan_ext_charge,
data_plan_avg_charge], axis = 1)
df_data.columns = ['# of customers', 'Churn', 'Avg. monthly GB download', 'Avg. extra data charges', 'Avg. monthly charges']
df_data['Churn rate %'] = df_data['Churn'] / df_data['# of customers'] * 100
print(df_data)
In [20]:
df_temp = df_data.reset_index()
print(df_temp)
fig = plt.figure(figsize=(6,6))
colors = ('#1f77b4', 'orange')
ax1 = fig.add_subplot(131)
ax1.set_xlabel('Unlimited plan \n 0-4 GB')
ax1.set_ylabel('Churn %')
ax1.bar(df_temp[df_temp['Data group'] == '0-4']['Unlimited Data Plan'], df_temp[df_temp['Data group'] == '0-4']
['Churn rate %'], color=colors)
ax1.tick_params(axis='y')
ax1.set_ylim(0, 35)
ax2 = fig.add_subplot(132)
ax2.set_xlabel('Unlimited plan \n 5-9 GB')
ax2.bar(df_temp[df_temp['Data group'] == '5-9']['Unlimited Data Plan'], df_temp[df_temp['Data group'] == '5-9']
['Churn rate %'], color=colors)
ax2.set_ylim(0, 35)
ax3 = fig.add_subplot(133)
ax3.set_xlabel('Unlimited plan \n 10+ GB')
ax3.bar(df_temp[df_temp['Data group'] == '10+']['Unlimited Data Plan'], df_temp[df_temp['Data group'] == '10+']
['Churn rate %'], color=colors)
ax3.set_ylim(0, 35)
plt.suptitle('Avg. monthly data usage')
plt.show()
In [21]:
# review detail unlimited data plan churn reason
data_plan_churn = df.groupby(['Unlimited Data Plan','Churn Reason'])['Churn'].sum().reset_index()
data_plan_churn['Churn rate %'] = data_plan_churn['Churn'] / total_customers * 100
print(data_plan_churn)
In [22]:
#review relationship between contract type, payment method versus account duration
contract_length = df.groupby(['Contract Type','Payment Method'])['Account Length (in months)'].mean()
contract_churn = df.groupby(['Contract Type','Payment Method'])['Churn'].sum()
contract_customer = df.groupby(['Contract Type','Payment Method'])['Customer ID'].count()
contract_monthly_charge = df.groupby(['Contract Type','Payment Method'])['Monthly Charge'].mean()
contract_df = pd.concat([contract_customer, contract_churn, contract_length, contract_monthly_charge],
axis = 1).reset_index()
contract_df.columns = ['Contract Type', 'Payment Method', "# of Customers", 'Churn', 'Avg. Acct Length',
'Avg. Monthly Charge']
contract_df['Churn rate %'] = contract_df['Churn'] / contract_df['# of Customers'] * 100
print(contract_df)
fig,ax1 = plt.subplots()
ax1.scatter(contract_df['Avg. Acct Length'], contract_df['Churn rate %'],
s = contract_df['Churn'], color =['red', 'red', 'red', 'blue', 'blue', 'blue', 'violet', 'violet', 'violet'])
ax1.set_ylabel('Churn rate %')
ax1.text(10, 20,'Month-to-Month', color = 'red')
ax1.text(35, 20,'One Year', color = 'blue')
ax1.text(50, 5,'Two Year', color = 'blue')
ax1.set_title('Churn rate by account length by payment type')
ax2 = ax1.twinx()
ax2.scatter(contract_df['Avg. Acct Length'], contract_df['Avg. Monthly Charge'], color = 'orange')
ax2.set_ylabel('Avg. monthly charge $', color = 'orange')
plt.show()
No comments:
Post a Comment