# covid_19_Analysis.py

# -*- coding: utf-8 -*-
"""COVID-19_Project.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1cfPZNewSc7XqfctPaLQ6x67LY33sjKc_
"""

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn

# Load the COVID-19 dataset; the CSV is expected in the working directory.
df = pd.read_csv('covid_19_clean_complete.csv')

# First look at the raw data. Outside a notebook a bare expression such as
# ``df.head()`` displays nothing, so every inspection result is printed.
print(df.head(200))
print(df.dtypes)
print(df.shape)
df.info()  # info() writes its report to stdout itself
print(df.describe())
print(df.isnull().sum())

# Drop the province/state column, then remove exact duplicate rows.
df = df.drop(columns=['Province/State']).drop_duplicates()

# Replace missing counts with the mean of the respective column.
for count_column in ('Recovered', 'Active', 'Deaths'):
    df[count_column] = df[count_column].fillna(df[count_column].mean())

print(df.nunique())

# NOTE(review): the notebook evaluated the bare expression ``49068/188`` here
# (presumably total rows / number of distinct dates - confirm). It had no
# effect on the program, so only the follow-up preview is kept.
print(df.head(262))

# Confirm the missing values are gone and preview the cleaned frame.
print(df.isnull().sum())
print(df.head(20))

# Per-date totals over all rows, largest total first, for each count column.
for count_column in ('Deaths', 'Recovered', 'Confirmed'):
    totals_by_date = df.groupby('Date')[count_column].sum()
    print(totals_by_date.sort_values(ascending=False))

# Recompute 'Active' from the other three counts so the columns agree.
df['Active'] = df['Confirmed'] - df['Deaths'] - df['Recovered']

df.head()

# Horizontal bar chart of the per-country maximum, mean and minimum of the
# 'Recovered' column, overlaid on a single set of axes.
# (The original lines carried stray leading spaces, which is an
# IndentationError at module level and prevented the file from running.)
recovered_by_country = df.groupby('Country/Region')['Recovered']

plt.figure(figsize=(80, 30))
recovered_by_country.max().plot(kind='barh', color='r', label='max')
recovered_by_country.mean().plot(kind='barh', color='y', label='mean')
recovered_by_country.min().plot(kind='barh', color='g', label='min')
# The plotted column is 'Recovered', so the labels say so (they used to read
# 'Active', which did not match the data on the chart).
plt.xlabel('Recovered')
plt.ylabel('Country_Region')
plt.title('Recovered')
plt.legend()
plt.show()

# Per-country and per-date summaries. They are printed explicitly because a
# bare expression shows nothing when the file runs as a script.
recovered_by_country = df.groupby('Country/Region')['Recovered']
print(recovered_by_country.sum().sort_values(ascending=False))
print(recovered_by_country.mean().sort_values(ascending=False))
print(df.groupby('Date')['Deaths'].sum())
print(recovered_by_country.min())

# Mean confirmed count per country as a horizontal bar chart.
plt.figure(figsize=(50, 80))
df.groupby('Country/Region')['Confirmed'].mean().plot.barh()
# Finish this figure here; without it the scatter plot below was drawn onto
# the same axes as the bar chart.
plt.show()

print(df.groupby('Country/Region')['Deaths'].mean().sort_values(ascending=False))

# Deaths and recoveries against confirmed cases, on a figure of their own.
plt.figure()
plt.scatter(df['Confirmed'], df['Deaths'], color='red', marker='*', label='Deaths')
plt.scatter(df['Confirmed'], df['Recovered'], color='g', marker='^', label='Recovered')
plt.xlabel('Confirmed')
plt.ylabel('No . of People')
plt.legend()
plt.show()

# Bivariate Analysis
# seaborn.pairplot(df)

print(df.head(6))

# One-hot encode the two categorical columns. drop_first=True drops the first
# level of each so the dummy columns are not perfectly collinear.
df = pd.get_dummies(df, columns=['Country/Region', 'WHO Region'], drop_first=True)

print(df.head())

# Linear regression predicting 'Deaths' from the remaining columns.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 'Active' is computed earlier in this script as
# Confirmed - Deaths - Recovered, so together with 'Confirmed' and
# 'Recovered' it determines 'Deaths' exactly. Leaving it among the features
# leaks the target and makes the fit trivially perfect, so it is excluded.
X = df.drop(['Deaths', 'Date', 'Active'], axis=1)
y = df[['Deaths']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit the scaler on the training split only and reuse it for the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# LinearRegression.score returns the coefficient of determination (R^2),
# not a classification accuracy, so the messages name it accordingly.
print("R^2 of train data is : ", regressor.score(X_train, y_train))
print("R^2 of test data is : ", regressor.score(X_test, y_test))

# Mean squared error on the held-out split.
print("Mean Squared Error is : ", mean_squared_error(y_test, y_pred))

# prediction
# print(regressor.predict([[]]))

# Monthly ARIMA model of the 'Deaths' column.
dfm = df.copy()
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA

# Index the copy by date so it can be resampled.
dfm['Date'] = pd.to_datetime(dfm['Date'])
dfm.set_index('Date', inplace=True)

# Sum of 'Deaths' over every row that falls in each calendar month.
# NOTE(review): recent pandas releases deprecate the 'M' alias in favour of
# 'ME'; kept as 'M' here so older pandas versions still work.
monthly_deaths = dfm['Deaths'].resample('M').sum()
print(monthly_deaths.head())

# Augmented Dickey-Fuller test: a p-value above 0.05 means a unit root
# (non-stationarity) cannot be rejected.
result = adfuller(monthly_deaths.dropna())
print("ADF Statistic:", result[0])
print("p-value:", result[1])
if result[1] > 0.05:
    monthly_deaths_diff = monthly_deaths.diff().dropna()
    diff_order = 1
else:
    monthly_deaths_diff = monthly_deaths
    diff_order = 0

# Fit the ARIMA model on the ORIGINAL series and let the model's `d` term do
# the differencing. Previously the manually differenced series was passed to
# ARIMA(1, 1, 1), which differenced it a second time and produced forecasts
# in differenced units. `monthly_deaths_diff` is kept for inspection only.
model = ARIMA(monthly_deaths, order=(1, diff_order, 1))
model_fit = model.fit()
print(model_fit.summary())

# Forecast the next 20 periods (months).
forecast = model_fit.forecast(steps=20)
print(forecast)

# dfm is indexed by Date and holds several rows per date, so differencing it
# row by row mixes unrelated rows. Aggregate to one total per date first,
# then take the first difference of that series.
daily_deaths = dfm['Deaths'].groupby(level=0).sum().sort_index()
# Kept as a one-column DataFrame so ``df_diff['Deaths']`` still works.
df_diff = daily_deaths.diff().dropna().to_frame()

plt.figure(figsize=(10, 6))
plt.plot(df_diff.index, df_diff['Deaths'], marker='o')
plt.title('Differenced Total Deaths Over Time')
plt.xlabel('Date')
plt.ylabel('Differenced Total Deaths')
plt.grid(True)
plt.show()

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Autocorrelation of the differenced series.
plot_acf(df_diff['Deaths'])
plt.show()

# Partial autocorrelation of the differenced series.
plot_pacf(df_diff['Deaths'])
plt.show()

from statsmodels.tsa.arima.model import ARIMA

# Total deaths per date. The dates are parsed BEFORE grouping: grouping on
# the raw strings orders the groups by string comparison, which is not
# chronological unless the dates happen to be ISO-formatted, and ARIMA needs
# the observations in time order. groupby sorts its keys, so the result is
# chronological.
df_grouped = (
    df.groupby(pd.to_datetime(df['Date']))['Deaths']
    .sum()
    .reset_index()
)

# Define and fit the model.
model = ARIMA(df_grouped["Deaths"], order=(2, 3, 2))
model_fit = model.fit()

# Summary of the model.
print(model_fit.summary())

# Forecast the next 30 days.
forecast = model_fit.forecast(steps=30)

# The forecast starts on the day AFTER the last observation.
last_date = df_grouped['Date'].iloc[-1]
forecast_dates = pd.date_range(
    start=last_date + pd.Timedelta(days=1), periods=30, freq='D'
)

# Plot the aggregated series the model was fitted on, not the per-row data,
# together with the forecast.
plt.figure(figsize=(10, 6))
plt.plot(df_grouped['Date'], df_grouped['Deaths'], label='Actual')
plt.plot(forecast_dates, forecast, label='Forecast', color='red')
plt.title('COVID-19 Deaths Forecast')
plt.xlabel('Date')
plt.ylabel('Total Deaths')
plt.legend()
plt.grid(True)
plt.show()

from prophet import Prophet

# Prepare data for Prophet: one row per date with the total deaths, using the
# column names Prophet expects ('ds' for the date, 'y' for the value).
# Previously the whole un-aggregated frame was passed in, so the model saw
# many separate rows for the same date instead of the daily total.
daily_deaths = df.groupby(pd.to_datetime(df['Date']))['Deaths'].sum()
df_prophet = daily_deaths.reset_index().rename(columns={'Date': 'ds', 'Deaths': 'y'})

# Initialize and fit the model.
model = Prophet()
model.fit(df_prophet)

# Extend the timeline 30 days past the data and predict over all of it.
future = model.make_future_dataframe(periods=30)
forecast = model.predict(future)

# Plot the forecast.
fig = model.plot(forecast)
plt.title('COVID-19 Deaths Forecast')
plt.xlabel('Date')
plt.ylabel('Total Deaths')
plt.show()

# Plot the forecast components.
fig2 = model.plot_components(forecast)
plt.show()

# Forecast the next 30 days with the fitted ARIMA model held in `model_fit`.
forecast = model_fit.forecast(steps=30)

# df_grouped has a plain integer index, so the dates must come from its
# 'Date' column. Passing the integer index to pd.date_range produced
# timestamps in 1970 and mixed integers with datetimes on the x-axis.
# Assumes the rows of df_grouped are in chronological order, so the last row
# is the last observation.
observed_dates = pd.to_datetime(df_grouped['Date'])
forecast_dates = pd.date_range(
    start=observed_dates.iloc[-1] + pd.Timedelta(days=1), periods=30, freq='D'
)

# Plot the actual and forecasted values on a shared date axis.
plt.figure(figsize=(10, 6))
plt.plot(observed_dates, df_grouped['Deaths'], label='Actual')
plt.plot(forecast_dates, forecast, label='Forecast', color='red')
plt.title('COVID-19 Deaths Forecast')
plt.xlabel('Date')
plt.ylabel('Total Deaths')
plt.legend()
plt.grid(True)
plt.show()

from sklearn.metrics import mean_squared_error

# Hold-out evaluation: the first 80% of the rows train the model and the
# remaining 20% are used for testing. The split is positional, not shuffled.
train_size = int(len(df_grouped) * 0.8)
train, test = df_grouped.iloc[:train_size], df_grouped.iloc[train_size:]

# Fit the model on the training data.
model = ARIMA(train['Deaths'], order=(1, 1, 1))
model_fit = model.fit()

# Forecast exactly as many steps as there are test rows.
test_predictions = model_fit.forecast(steps=len(test))

# Root mean squared error. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error is deprecated and has been
# removed from recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(test['Deaths'], test_predictions))
print(f'Root Mean Squared Error: {rmse}')

# Plot the actual and predicted values. The x-axis is the row position in
# df_grouped.
plt.figure(figsize=(10, 6))
plt.plot(train.index, train['Deaths'], label='Training')
plt.plot(test.index, test['Deaths'], label='Actual')
plt.plot(test.index, test_predictions, label='Predicted', color='red')
plt.title('COVID-19 Deaths Prediction')
plt.xlabel('Date')
plt.ylabel('Total Deaths')
plt.legend()
plt.grid(True)
plt.show()