HeidelbergCement Material Strength

Exploratory analysis

In [1]:
import warnings
warnings.filterwarnings('ignore')
In [2]:
import plotly.io as pio
pio.renderers.default='notebook'

Import of data and libraries

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

path = "C:/Users/HP/OneDrive/Documents/Python Anaconda/Heidelberg Cement"
os.chdir(path)

data = pd.read_excel('interview_dataset.xlsx', sheet_name='Sheet1')
data.head()
Out[3]:
X_0 X_1 X_2 X_3 X_4 X_5 X_6 X_7 X_8 X_9 ... X_18 X_19 y_0 y_1 y_2 y_3 y_4 y_5 X_26 X_28
0 1.81 NaN 0.00 0.00 0.00 0.00 0.00 0.49 0.15 2.86 ... NaN NaN 59.3 60.1 60.3 61.4 59.8 59.7 2012-02-14 10:00:00 0---200
1 2.64 0.87 20.24 4.93 3.04 65.02 1.28 0.59 0.13 2.73 ... 7.3 8.3 49.3 48.4 48.6 48.4 48.7 50.9 2012-04-02 10:00:00 0---200
2 2.77 0.95 0.00 0.00 0.00 0.00 0.00 0.42 0.13 2.70 ... 7.4 7.7 47.6 47.4 48.2 47.0 48.3 47.1 2012-04-03 10:00:00 0---200
3 2.70 0.88 0.00 0.00 0.00 0.00 0.00 0.60 0.14 2.68 ... 8.0 7.4 47.3 47.5 48.8 47.8 48.1 47.8 2012-04-10 10:00:00 0---200
4 2.67 1.02 0.00 0.00 0.00 0.00 0.00 0.59 0.14 2.74 ... 7.6 7.9 51.2 49.7 51.0 48.6 50.9 50.4 2012-04-11 10:00:00 0---200

5 rows × 28 columns

Summary

In [4]:
data.describe()
Out[4]:
X_0 X_1 X_2 X_3 X_4 X_5 X_6 X_7 X_8 X_9 ... X_16 X_17 X_18 X_19 y_0 y_1 y_2 y_3 y_4 y_5
count 4002.000000 8495.000000 8054.000000 8054.000000 8054.000000 8054.000000 8054.000000 8500.000000 8500.000000 8500.000000 ... 8490.000000 4179.000000 8490.000000 8495.000000 8459.000000 8459.000000 8459.000000 8458.000000 8458.000000 8458.000000
mean 5.362351 0.494819 29.396692 29.412113 28.714113 24.888622 11.082288 0.636089 0.168848 2.817500 ... 55.202273 6.442402 5.101873 7.519882 56.697115 56.716669 56.714316 56.783471 56.665642 56.701289
std 2.430134 0.302909 42.462445 42.671716 42.720091 33.306719 17.309875 0.222255 0.073127 0.149969 ... 11.888348 2.411407 1.361524 1.802390 6.364591 6.343017 6.387708 9.366963 6.402062 8.409899
min 0.560000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.480000 ... 0.000000 0.000000 0.000000 0.000000 38.000000 36.700000 38.800000 38.100000 38.200000 34.600000
25% 2.320000 0.260000 0.000000 0.000000 0.000000 0.000000 0.000000 0.550000 0.130000 2.710000 ... 52.500000 4.600000 4.400000 6.600000 52.500000 52.600000 52.500000 52.600000 52.400000 52.300000
50% 6.650000 0.450000 0.000000 0.000000 0.000000 0.000000 0.000000 0.650000 0.140000 2.800000 ... 57.000000 6.600000 5.200000 7.600000 56.900000 57.000000 57.000000 57.000000 56.900000 56.900000
75% 7.190000 0.680000 92.100000 92.130000 91.360000 53.067500 16.942500 0.770000 0.210000 2.910000 ... 64.900000 8.200000 5.900000 8.700000 59.000000 59.000000 59.000000 59.100000 59.000000 59.000000
max 8.660000 2.460000 99.010000 103.240000 96.880000 91.840000 59.050000 1.120000 0.360000 3.700000 ... 73.600000 13.500000 11.700000 14.100000 77.400000 77.000000 77.300000 689.000000 76.500000 557.000000

8 rows × 26 columns

Missing values

In [5]:
plt.figure(figsize=(15, 10))
heatmap = sns.heatmap(data.isnull(),yticklabels=False,cbar=False,cmap='viridis')
bottom, top = heatmap.get_ylim()
heatmap.set_ylim(bottom + 0.5, top - 0.5)
plt.show()

Data correlation

In [6]:
plt.figure(figsize=(18, 10))
heatmap = sns.heatmap(data.corr(), vmin=-1, vmax=1, annot=True)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12)
bottom, top = heatmap.get_ylim()
heatmap.set_ylim(bottom + 0.5, top - 0.5)
plt.show()

Distribution

In [7]:
data.hist(bins=40, figsize=(15, 15))
plt.show()
In [8]:
sns.set_style('whitegrid') # selecting a particular variable
sns.distplot(data['X_12'], bins=20) # CHANGE
plt.show()

Feature engineering

In [9]:
print(data['X_26'].value_counts()) # X_26 separate time and date
data['time'] = pd.to_datetime(data['X_26'], format='%Y:%M:%D').dt.time
data['date'] = pd.to_datetime(data['X_26'], format='%Y:%M:%D').dt.date
data = data.drop('X_26', 1)
2013-05-27 10:00:00    9
2012-11-26 10:00:00    9
2014-09-30 10:00:00    9
2012-04-02 10:00:00    9
2014-01-08 10:00:00    9
                      ..
2017-01-11 10:46:00    1
2017-09-04 08:15:00    1
2016-02-29 07:55:00    1
2015-04-20 13:01:00    1
2018-04-03 08:34:00    1
Name: X_26, Length: 3885, dtype: int64
In [10]:
import plotly.express as px # material strength over time
fig = px.scatter(data, x='date', y='y',
                 title="Material strength (y) through the time",
                 template="simple_white")
fig.show()
In [11]:
print(data['X_28'].value_counts()) # X_28 from a string into two categories
data[['X_28','X_29']] = data.X_28.str.split("---",expand=True,)
data = data[['X_0', 'X_1', 'X_2', 'X_3', 'X_4', 'X_5', 'X_6', 'X_7', 'X_8', 'X_9',
       'X_10', 'X_11', 'X_12', 'y', 'X_14', 'X_15', 'X_16', 'X_17', 'X_18',
       'X_19', 'y_0', 'y_1', 'y_2', 'y_3', 'y_4', 'y_5', 'date',  'time', 'X_28',
       'X_29']]
6---200    975
5---200    966
7---200    966
1---200    966
2---250    966
3---150    964
4---200    964
8---150    924
0---200    809
Name: X_28, dtype: int64
In [12]:
import plotly.express as px # separation of trends based on one category
fig = px.scatter(data, x='date', y='y', facet_row="X_29",
                 title="Material strength (y) through the time",
                 template="simple_white")
fig.show()
In [13]:
import plotly.express as px # separation of trends based on one category
fig = px.scatter(data, x='date', y='y', facet_col="X_29",
                 title="Material strength (y) through the time",
                 template="simple_white")
fig.show()
In [14]:
print(data['X_14'].value_counts()) # X_14 deleting of a column with NAs only
data = data.drop('X_14', 1)
Series([], Name: X_14, dtype: int64)
In [15]:
print(data['X_12'].value_counts()) # X_12 simple linear regression for imputation
X_12 = data[['X_11', 'X_12']]
X_12 = X_12.dropna(how = 'any')

from sklearn.model_selection import train_test_split
X = X_12[['X_11']]
y = X_12['X_12']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=1001)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
model = LinearRegression() # linear regression for getting the coefficient
model.fit(X_train,y_train)
coefficient = pd.DataFrame(model.coef_,X.columns)
coefficient.columns = ['Coefficient']
coefficient = pd.DataFrame(coefficient)

coef = round(coefficient['Coefficient'][0],4)
inter = round(model.intercept_,4)
y_pred = model.predict(X_test)
print('Coefficient: ', round(coefficient['Coefficient'][0],4))
print('Intercept: ', round(model.intercept_,4))
print('Mean absolute error: ',round(mean_absolute_error(y_test, y_pred),4))
250.0    39
260.0    38
240.0    32
225.0    29
270.0    29
255.0    28
245.0    27
280.0    23
220.0    22
265.0    22
285.0    17
275.0    17
230.0    16
200.0    15
235.0    15
300.0    15
215.0    15
290.0    15
320.0    13
295.0    12
195.0    11
190.0     9
185.0     8
310.0     8
180.0     7
205.0     7
335.0     6
340.0     6
210.0     5
325.0     5
175.0     4
170.0     4
330.0     3
315.0     2
355.0     1
360.0     1
145.0     1
390.0     1
305.0     1
160.0     1
370.0     1
155.0     1
Name: X_12, dtype: int64
Coefficient:  1.1044
Intercept:  25.9863
Mean absolute error:  15.8969
In [16]:
plt.figure(figsize=(12,6)) # visualisation of regression for imputation
plt.scatter(X, y, alpha=0.9)

x = np.array(X_12.X_11)

# Fit function
f = lambda x: coef*x + inter
# Plot fit
plt.plot(x,f(x),lw=2.5, c="r", label="predicted X_12")
plt.xlabel("X_11")
plt.ylabel("X_12")
plt.legend()
plt.show()
In [17]:
def impute_X_12(cols): # imputation with help of the coefficient
    X_11 = cols[0]
    X_12 = cols[1]

    if pd.isnull(X_12):
        return X_11 * coef
    else:
        return X_12

data['X_12'] = round(data[['X_11','X_12']].apply(impute_X_12, axis=1),1)
print(data['X_12'].count())
8457
In [18]:
print(data['X_0'].value_counts()) # X_0 deleting of a column with lot of NAs
data = data.drop('X_0', 1)
2.09    34
2.16    33
6.89    33
6.81    33
6.75    32
        ..
8.31     1
6.14     1
5.91     1
5.37     1
5.53     1
Name: X_0, Length: 440, dtype: int64
In [19]:
print(data['X_17'].value_counts()) # X_17 mean imputation
mean = data['X_17'].mean()
mean

def impute_X_17(cols):
    X_17 = cols[0]

    if pd.isnull(X_17):
        return mean
    else:
        return X_17

data['X_17'] = round(data[['X_17']].apply(impute_X_17, axis=1),1)
print(data['X_17'].count())
8.1     81
7.9     76
8.2     74
8.7     71
8.0     69
        ..
13.5     1
0.5      1
0.9      1
13.4     1
13.3     1
Name: X_17, Length: 127, dtype: int64
8500
In [20]:
# REST of NAs deleted
data = data.dropna()
data
Out[20]:
X_1 X_2 X_3 X_4 X_5 X_6 X_7 X_8 X_9 X_10 ... y_0 y_1 y_2 y_3 y_4 y_5 date time X_28 X_29
1 0.87 20.24 4.93 3.04 65.02 1.28 0.59 0.13 2.73 25.0 ... 49.3 48.4 48.6 48.4 48.7 50.9 2012-04-02 10:00:00 0 200
2 0.95 0.00 0.00 0.00 0.00 0.00 0.42 0.13 2.70 26.0 ... 47.6 47.4 48.2 47.0 48.3 47.1 2012-04-03 10:00:00 0 200
3 0.88 0.00 0.00 0.00 0.00 0.00 0.60 0.14 2.68 25.0 ... 47.3 47.5 48.8 47.8 48.1 47.8 2012-04-10 10:00:00 0 200
4 1.02 0.00 0.00 0.00 0.00 0.00 0.59 0.14 2.74 25.0 ... 51.2 49.7 51.0 48.6 50.9 50.4 2012-04-11 10:00:00 0 200
5 0.94 0.00 0.00 0.00 0.00 0.00 0.63 0.14 2.63 25.0 ... 48.5 48.1 48.1 48.0 48.6 47.4 2012-04-16 10:00:00 0 200
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
8495 0.20 0.00 0.00 0.00 50.47 53.29 1.02 0.29 3.05 31.0 ... 46.6 46.4 44.3 42.6 42.6 40.0 2020-07-15 08:45:00 8 150
8496 0.50 0.00 0.00 0.00 49.92 54.74 1.05 0.30 3.06 31.0 ... 45.1 44.2 43.6 44.3 44.6 43.6 2020-07-20 08:25:00 8 150
8497 0.26 0.00 0.00 0.00 50.52 54.95 1.06 0.29 3.19 31.0 ... 46.1 42.1 44.5 43.1 42.6 42.6 2020-07-22 08:30:00 8 150
8498 0.26 0.00 0.00 0.00 49.75 54.81 1.12 0.30 3.16 31.0 ... 45.7 45.8 46.4 46.5 47.2 46.8 2020-07-27 08:10:00 8 150
8499 0.23 0.00 0.00 0.00 50.41 54.44 1.10 0.29 3.19 31.0 ... 49.7 48.7 48.5 48.8 48.2 48.6 2020-07-29 08:15:00 8 150

7965 rows × 28 columns

Models

Model1 - Multiple Linear Regression X

In [21]:
data.columns # splitting data into train and test set
df = data[['X_1', 'X_2', 'X_3', 'X_4', 'X_5', 'X_6', 'X_7', 'X_8', 'X_9', 'X_10',
       'X_11', 'X_12', 'X_15', 'X_16', 'X_17', 'X_18', 'X_19', 'X_28', 'X_29', 'y']]

from sklearn.model_selection import train_test_split
X = df[['X_1', 'X_2', 'X_3', 'X_4', 'X_5', 'X_7', 'X_8', 'X_9', 'X_10',
       'X_11', 'X_12', 'X_15', 'X_16', 'X_17', 'X_18', 'X_19', 'X_28', 'X_29']]
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1001)
In [22]:
from sklearn.linear_model import LinearRegression
model1 = LinearRegression() # multiple linear regression model
model1.fit(X_train, y_train)
coefficient = pd.DataFrame(model1.coef_,X.columns)
coefficient.columns = ['Coefficient']
coefficient = pd.DataFrame(coefficient)

y_pred = model1.predict(X_test)

outcome = pd.DataFrame({'y_test':y_test, 'y_pred':y_pred})
outcome['difference'] = outcome['y_test'] - outcome['y_pred']
outcome['difference_percentage'] = round(outcome.difference/(outcome.y_test/100),6)
print('Percentage difference: ', round(outcome.difference_percentage.abs().mean(),2),'%')

coef = round(coefficient['Coefficient'][0],4)
inter = round(model1.intercept_,4)
y_pred = model1.predict(X_test)
print('Coefficients: ','\n', round(coefficient['Coefficient'],4))

print('Intercept: ', round(model1.intercept_,4))
print('Mean absolute error: ',round(mean_absolute_error(y_test, y_pred),4))
Percentage difference:  2.65 %
Coefficients:  
 X_1     0.2729
X_2    -0.0223
X_3     0.0287
X_4    -0.0155
X_5     0.0147
X_7    -2.2038
X_8     7.5746
X_9     0.4527
X_10    0.4633
X_11   -0.0269
X_12   -0.0032
X_15    0.0009
X_16    0.0533
X_17   -0.0615
X_18    0.4665
X_19    0.3095
X_28    0.0737
X_29    0.1180
Name: Coefficient, dtype: float64
Intercept:  13.9522
Mean absolute error:  1.4655

Model2 - Multiple Linear Regression y Error

In [23]:
data.columns # splitting data into train and test set
df = data[['y_0', 'y_1', 'y_2', 'y_3', 'y_4', 'y_5', 'y']]

df['e_0'] = df['y_0'] - df['y']
df['e_1'] = df['y_1'] - df['y']
df['e_2'] = df['y_2'] - df['y']
df['e_3'] = df['y_3'] - df['y']
df['e_4'] = df['y_4'] - df['y']
df['e_5'] = df['y_5'] - df['y']

df['e'] = df[['e_0', 'e_1', 'e_2', 'e_3', 'e_4', 'e_5']].mean(axis=1)

from sklearn.model_selection import train_test_split
X = df[['y_0', 'y_1', 'y_2', 'y_3', 'y_4', 'y_5']]
y = df['e']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1001)
In [24]:
from sklearn.linear_model import LinearRegression
model2 = LinearRegression() # multiple linear regression model
model2.fit(X_train, y_train)
coefficient = pd.DataFrame(model2.coef_,X.columns)
coefficient.columns = ['Coefficient']
coefficient = pd.DataFrame(coefficient)

y_pred = model2.predict(X_test)

coef = round(coefficient['Coefficient'][0],4)
inter = round(model2.intercept_,4)
y_pred = model2.predict(X_test)
print('Coefficients: ','\n', round(coefficient['Coefficient'],4))

print('Intercept: ', round(model2.intercept_,4))
print('Mean absolute error: ',round(mean_absolute_error(y_test, y_pred),4))
Coefficients:  
 y_0   -0.0010
y_1    0.0001
y_2   -0.0005
y_3   -0.0002
y_4    0.0007
y_5    0.0010
Name: Coefficient, dtype: float64
Intercept:  -0.01
Mean absolute error:  0.0796

Model3 - Random Forest Regression X

In [25]:
data.columns # splitting data into train and test set
df = data[['X_1', 'X_2', 'X_3', 'X_4', 'X_5', 'X_6', 'X_7', 'X_8', 'X_9', 'X_10',
       'X_11', 'X_12', 'X_15', 'X_16', 'X_17', 'X_18', 'X_19', 'X_28', 'X_29', 'y']]

from sklearn.model_selection import train_test_split
X = df[['X_1', 'X_2', 'X_3', 'X_4', 'X_5', 'X_7', 'X_8', 'X_9', 'X_10',
       'X_11', 'X_12', 'X_15', 'X_16', 'X_17', 'X_18', 'X_19', 'X_28', 'X_29']]
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1001)
In [26]:
from sklearn.ensemble import RandomForestRegressor
model3 = RandomForestRegressor(n_estimators=500, random_state=1001) # random forest regression model
model3.fit(X_train, y_train)

y_pred = model3.predict(X_test)

outcome = pd.DataFrame({'y_test':y_test, 'y_pred':y_pred})
outcome['difference'] = outcome['y_test'] - outcome['y_pred']
outcome['difference_percentage'] = round(outcome.difference/(outcome.y_test/100),6)
print('Percentage difference: ', round(outcome.difference_percentage.abs().mean(),2),'%')

print('Mean absolute error: ',round(mean_absolute_error(y_test, y_pred),4))
Percentage difference:  1.82 %
Mean absolute error:  1.0087

Conclusion

The random forest model performed the best out of the models that I tried: decision tree regression, multiple linear regression, SVM and ANN. The random forest does not take into consideration outliers and even in comparison to the linear regression model where is high linear dependency, performs very well.

The Linear regression model of y error shows a really low mean absolute error because we try to estimate the error out of the six measurements. One can see high linear dependency, which resulted in a choice of a multiple linear regression model.