-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalyze.py
237 lines (200 loc) · 9.32 KB
/
analyze.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import io
# Number of leading rows kept for the (cheaper) plots on this page.
subset_size = 100

# Read the full dataset; the path is resolved against the current working directory.
df = pd.read_csv("Crop_recommendation.csv")

# Column labels of the float64/int64 columns, offered as choices in the plot selectors.
numeric_columns = df.select_dtypes(include=["float64", "int64"]).columns

# NOTE: despite the "sampled" naming this is the FIRST `subset_size` rows of the
# file, not a random sample.
df_sampled = df.head(n=subset_size)
def _render_figure(fig):
    """Render a Matplotlib figure in the Streamlit page, then close it.

    pyplot keeps every figure it creates registered until it is explicitly
    closed. This page builds many figures each time it is rendered, so
    leaving them open makes memory grow with every re-render.
    """
    st.pyplot(fig)
    plt.close(fig)


def show_analyze():
    """Render the "Data Analysis and Visualization" Streamlit page.

    Reads the module-level ``df``, ``df_sampled`` and ``numeric_columns`` and
    writes, in order: a dataset preview, a column picker with CSV download of
    the selected (NaN-dropped) columns, a histogram, a correlation matrix, a
    series of single/multi-feature plots chosen through select boxes, and a
    block of custom CSS. All plots are drawn from ``df_sampled``.

    Returns:
        None. Everything is emitted through Streamlit calls.
    """
    st.title("Data Analysis and Visualization")

    st.subheader("Dataset Overview")
    st.write(df_sampled.head())

    # User selects columns for analysis
    st.subheader("Select Columns for Cleaning")
    selected_columns = st.multiselect(
        "Select Columns to Include in Cleaned Data:", df.columns
    )
    if selected_columns:
        cleaned_df = df_sampled[selected_columns].dropna()
        st.write(cleaned_df.head())

        # Allow the user to download the cleaned CSV.
        # to_csv() without a target returns the CSV text directly.
        st.download_button(
            label="Download Cleaned Data as CSV",
            data=cleaned_df.to_csv(index=False),
            file_name="cleaned_crop_data.csv",
            mime="text/csv",
        )

    # Feature Distributions
    st.subheader("Feature Distributions")
    feature = st.selectbox("Select a feature to visualize", numeric_columns)
    if feature:
        # Create a histogram for the selected feature
        fig, ax = plt.subplots()
        df_sampled[feature].hist(ax=ax, bins=20, color='skyblue', edgecolor='black')
        ax.set_title(f'Distribution of {feature} (Sampled)')
        ax.set_xlabel(feature)
        ax.set_ylabel('Frequency')
        _render_figure(fig)

    # Correlation Matrix
    st.subheader("Correlation Matrix")
    numeric_df = df_sampled[numeric_columns]
    corr = numeric_df.corr()
    st.write(corr)

    # Create a correlation matrix plot
    fig, ax = plt.subplots()
    cax = ax.matshow(corr, cmap='coolwarm')
    fig.colorbar(cax)
    ax.set_xticks(range(len(numeric_columns)))
    ax.set_yticks(range(len(numeric_columns)))
    ax.set_xticklabels(numeric_columns, rotation=90)
    ax.set_yticklabels(numeric_columns)
    ax.set_title('Correlation Matrix (Sampled)')
    _render_figure(fig)

    # More advanced analysis
    st.subheader("More Advanced Analysis")

    # 1. Box Plot
    box_feature = st.selectbox("Select a feature for box plot", numeric_columns)
    if box_feature:
        fig, ax = plt.subplots()
        sns.boxplot(x=df_sampled[box_feature], ax=ax, color='lightblue')
        ax.set_title(f'Box Plot of {box_feature} (Sampled)')
        _render_figure(fig)

    # 2. Scatter Plot
    x_axis = st.selectbox("Select X-axis feature for scatter plot", numeric_columns)
    y_axis = st.selectbox("Select Y-axis feature for scatter plot", numeric_columns)
    if x_axis and y_axis:
        fig, ax = plt.subplots()
        ax.scatter(df_sampled[x_axis], df_sampled[y_axis], alpha=0.5, color='#417f85')
        ax.set_xlabel(x_axis)
        ax.set_ylabel(y_axis)
        ax.set_title(f'Scatter Plot of {x_axis} vs {y_axis} (Sampled)')
        _render_figure(fig)

    # 3. KDE Plot
    kde_feature = st.selectbox("Select a feature for KDE plot", numeric_columns)
    if kde_feature:
        fig, ax = plt.subplots()
        sns.kdeplot(df_sampled[kde_feature], ax=ax, fill=True, color='purple')
        ax.set_title(f'KDE Plot of {kde_feature} (Sampled)')
        ax.set_xlabel(kde_feature)
        ax.set_ylabel('Density')
        _render_figure(fig)

    # 4. Heatmap of Feature Distributions
    st.subheader("Heatmap of Feature Distributions")
    heatmap_feature = st.multiselect(
        "Select features for heatmap (numeric only)", numeric_columns
    )
    if heatmap_feature:
        fig, ax = plt.subplots()
        sns.heatmap(
            df_sampled[heatmap_feature].corr(),
            annot=True,
            fmt=".2f",
            cmap='coolwarm',
            ax=ax,
        )
        ax.set_title('Heatmap of Selected Features (Sampled)')
        _render_figure(fig)

    # 5. Violin Plot
    violin_feature = st.selectbox("Select a feature for Violin plot", numeric_columns)
    if violin_feature:
        fig, ax = plt.subplots()
        sns.violinplot(x=df_sampled[violin_feature], ax=ax, color='orange')
        ax.set_title(f'Violin Plot of {violin_feature} (Sampled)')
        _render_figure(fig)

    # 6. Histogram with Density Plot Overlay
    hist_feature = st.selectbox(
        "Select a feature for Histogram with Density Overlay", numeric_columns
    )
    if hist_feature:
        fig, ax = plt.subplots()
        sns.histplot(df_sampled[hist_feature], kde=True, ax=ax, color='steelblue')
        ax.set_title(f'Histogram with Density Plot of {hist_feature} (Sampled)')
        ax.set_xlabel(hist_feature)
        ax.set_ylabel('Frequency')
        _render_figure(fig)

    # 7. Pair Plot
    st.subheader("Pair Plot of Selected Features")
    selected_pairplot_features = st.multiselect(
        "Select features for pair plot", numeric_columns
    )
    if selected_pairplot_features:
        # pairplot returns a seaborn grid; hand Streamlit the Figure it wraps.
        pair_grid = sns.pairplot(df_sampled[selected_pairplot_features])
        _render_figure(pair_grid.figure)

    # 8. Joint Plot
    joint_x = st.selectbox("Select X for Joint Plot", numeric_columns)
    # Default Y to the second numeric column, but fall back to the first when
    # there is only one (a fixed index of 1 would be out of range).
    joint_y_default = 1 if len(numeric_columns) > 1 else 0
    joint_y = st.selectbox(
        "Select Y for Joint Plot", numeric_columns, index=joint_y_default
    )
    if joint_x and joint_y:
        joint_grid = sns.jointplot(
            x=joint_x, y=joint_y, data=df_sampled, kind="scatter", color='green'
        )
        _render_figure(joint_grid.figure)

    # 9. Line Plot
    line_feature = st.selectbox("Select a feature for Line plot", numeric_columns)
    if line_feature:
        fig, ax = plt.subplots()
        df_sampled[line_feature].plot(ax=ax, color='red')
        ax.set_title(f'Line Plot of {line_feature} (Sampled)')
        ax.set_xlabel('Index')
        ax.set_ylabel(line_feature)
        _render_figure(fig)

    # 10. Swarm Plot
    swarm_feature = st.selectbox("Select a feature for Swarm plot", numeric_columns)
    if swarm_feature:
        fig, ax = plt.subplots()
        sns.swarmplot(x=df_sampled[swarm_feature], ax=ax, color='blue')
        ax.set_title(f'Swarm Plot of {swarm_feature} (Sampled)')
        _render_figure(fig)

    # 11. Strip Plot
    strip_feature = st.selectbox("Select a feature for Strip plot", numeric_columns)
    if strip_feature:
        fig, ax = plt.subplots()
        sns.stripplot(x=df_sampled[strip_feature], ax=ax, color='brown')
        ax.set_title(f'Strip Plot of {strip_feature} (Sampled)')
        _render_figure(fig)

    # 12. Bar Plot (frequency of each distinct value of the feature)
    bar_feature = st.selectbox("Select a feature for Bar plot", numeric_columns)
    if bar_feature:
        fig, ax = plt.subplots()
        df_sampled[bar_feature].value_counts().plot(kind='bar', ax=ax, color='blue')
        ax.set_title(f'Bar Plot of {bar_feature} (Sampled)')
        _render_figure(fig)

    # 13. Rug Plot
    rug_feature = st.selectbox("Select a feature for Rug plot", numeric_columns)
    if rug_feature:
        fig, ax = plt.subplots()
        sns.rugplot(x=df_sampled[rug_feature], ax=ax, color='green')
        ax.set_title(f'Rug Plot of {rug_feature} (Sampled)')
        _render_figure(fig)

    # 14. Point Plot
    point_feature = st.selectbox("Select a feature for Point plot", numeric_columns)
    if point_feature:
        # Larger canvas: one x position is drawn per row of df_sampled.
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.pointplot(
            x=df_sampled.index,
            y=point_feature,
            data=df_sampled,
            ax=ax,
            color='purple',
        )
        ax.set_title(f'Point Plot of {point_feature} (Sampled)', fontsize=14)
        ax.set_xlabel('Index', fontsize=12)
        ax.set_ylabel(point_feature, fontsize=12)
        ax.tick_params(axis='x', labelsize=10)
        ax.tick_params(axis='y', labelsize=10)
        _render_figure(fig)

    # 15. Density Plot
    density_feature = st.selectbox("Select a feature for Density plot", numeric_columns)
    if density_feature:
        fig, ax = plt.subplots()
        sns.kdeplot(df_sampled[density_feature], ax=ax, color='cyan')
        ax.set_title(f'Density Plot of {density_feature} (Sampled)')
        _render_figure(fig)

    # 16. Count Plot
    count_feature = st.selectbox("Select a feature for Count plot", numeric_columns)
    if count_feature:
        fig, ax = plt.subplots(figsize=(10, 6))
        # NOTE(review): newer seaborn releases appear to warn when `palette` is
        # given without `hue` - confirm against the installed version.
        sns.countplot(x=df_sampled[count_feature], ax=ax, palette='pastel')
        ax.set_title(f'Count Plot of {count_feature} (Sampled)', fontsize=14)
        ax.set_xlabel(count_feature, fontsize=12)
        ax.set_ylabel('Count', fontsize=12)
        ax.tick_params(axis='x', labelsize=10)
        ax.tick_params(axis='y', labelsize=10)
        _render_figure(fig)

    # Customizing page appearance with injected CSS. The string content is kept
    # flush-left so the emitted text is exactly what the original passed.
    st.markdown("""
<style>
.main .block-container { padding: 1rem; }
h1, h2, h3 { color: white; }
.css-1aumxhk { background-color: #111; }
.stButton button { background-color: #4CAF50; color: white; }
</style>
""", unsafe_allow_html=True)
# Render the page each time this script is executed.
# NOTE(review): this call is at module level, so it also runs when the module
# is imported - confirm that is intended if another page imports show_analyze.
show_analyze()