-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
251 lines (202 loc) · 8.8 KB
/
Copy pathmain.py
File metadata and controls
251 lines (202 loc) · 8.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
#!/usr/bin/env python3
"""
🚀 FAKE NEWS DETECTION - DAY 1 COMPLETE VERSION
All functionality working with multiple models and predictions
"""
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
def create_sample_data():
    """Build a small in-memory fake-news dataset for quick smoke tests.

    Returns:
        A DataFrame with a ``text`` column (the ten real headlines followed
        by the ten fake ones) and a ``label`` column where 0 marks a real
        item and 1 marks a fake item.

    The total row count and the per-class counts are printed to stdout.
    """
    print("📊 Loading dataset...")

    # Headlines treated as genuine reporting (label 0).
    genuine_headlines = [
        "Scientists discover new planet in nearby solar system using advanced telescopes",
        "Local mayor announces new infrastructure project for downtown area",
        "Stock market shows steady growth this quarter according to financial reports",
        "New medical breakthrough helps treat rare disease in clinical trials",
        "University researchers publish climate change study in scientific journal",
        "Technology company releases quarterly earnings report showing profits",
        "International trade agreement signed between two countries this week",
        "Archaeological team uncovers ancient artifacts in historical excavation site",
        "Sports team wins championship in overtime victory against rivals",
        "Government announces new education funding initiative for public schools",
    ]

    # Headlines treated as fabricated stories (label 1).
    fabricated_headlines = [
        "Aliens land in major city downtown area, government covers up story completely",
        "Miracle cure discovered that doctors don't want you to know about",
        "Celebrity secretly controls world government from hidden underground base",
        "Dangerous vaccines contain mind control microchips according to insider",
        "Local politician caught in massive conspiracy scandal involving millions",
        "Scientists hide evidence that earth is actually flat, whistleblower reveals",
        "Secret society plans to control all social media platforms next month",
        "Ancient prophecy predicts end of world next month according to expert",
        "Government uses weather machines to control natural disasters and storms",
        "Billionaire entrepreneur is actually time traveling alien from future",
    ]

    genuine_count = len(genuine_headlines)
    fabricated_count = len(fabricated_headlines)

    # Real items come first, so the label column is a run of 0s then 1s.
    df = pd.DataFrame({
        'text': [*genuine_headlines, *fabricated_headlines],
        'label': [0 for _ in genuine_headlines] + [1 for _ in fabricated_headlines],
    })

    print(f"Dataset loaded: {len(df)} articles")
    print(f"Real news: {genuine_count}")
    print(f"Fake news: {fabricated_count}")
    return df
def preprocess_text(text):
    """Normalise a piece of text: lowercase it and collapse whitespace.

    Args:
        text: The raw value to clean. Values that ``pd.isna`` reports as
            missing (for example ``None`` or NaN) yield an empty string;
            anything else is converted with ``str`` first.

    Returns:
        The lowercased text with every run of whitespace replaced by a
        single space and no leading or trailing whitespace.
    """
    if pd.isna(text):
        return ""

    # split() with no separator drops leading/trailing whitespace and treats
    # any whitespace run as one delimiter, so re-joining collapses the runs.
    tokens = str(text).lower().split()
    return ' '.join(tokens)
def train_models(X_train, X_test, y_train, y_test):
    """Fit four classifiers on the training split and score them on the test split.

    Args:
        X_train: Feature matrix used for fitting.
        X_test: Feature matrix used for evaluation.
        y_train: Labels matching ``X_train``.
        y_test: Labels matching ``X_test``.

    Returns:
        A dict keyed by the model's display name. Each value is a dict with
        the fitted estimator under ``'model'``, its test accuracy under
        ``'accuracy'`` and its test-set predictions under ``'predictions'``.

    Each model's accuracy is printed as soon as it has been computed.
    """
    print("Training models...")

    # (display name, unfitted estimator) pairs, evaluated in this order.
    candidates = [
        ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42)),
        ('Naive Bayes', MultinomialNB()),
        ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
        # probability=True so the SVM also exposes predict_proba.
        ('SVM', SVC(kernel='linear', probability=True, random_state=42)),
    ]

    results = {}
    for display_name, estimator in candidates:
        estimator.fit(X_train, y_train)
        predicted = estimator.predict(X_test)
        score = accuracy_score(y_test, predicted)

        results[display_name] = {
            'model': estimator,
            'accuracy': score,
            'predictions': predicted,
        }
        print(f"{display_name} Accuracy: {score:.2f}")

    return results
def predict_article(model, vectorizer, article_text):
    """Classify a single article as fake or real and report a confidence.

    Args:
        model: A fitted classifier exposing ``predict`` and ``predict_proba``.
        vectorizer: A fitted vectorizer exposing ``transform``.
        article_text: The raw article text; it is cleaned with
            ``preprocess_text`` before vectorizing.

    Returns:
        A ``(label, confidence)`` tuple. ``label`` is ``"FAKE"`` when the
        predicted class equals 1 and ``"REAL"`` otherwise. ``confidence`` is
        the model's probability for the predicted class, as a percentage.
    """
    processed_text = preprocess_text(article_text)

    # transform() expects an iterable of documents, hence the one-item list.
    text_vector = vectorizer.transform([processed_text])

    prediction = model.predict(text_vector)[0]
    probabilities = model.predict_proba(text_vector)[0]

    # Report the probability of the class that was actually predicted.
    # For some estimators (e.g. SVC with probability=True) predict() and
    # predict_proba() can disagree, so max(probabilities) could belong to the
    # *other* class and produce a misleading "label (confidence)" pair.
    classes = list(getattr(model, "classes_", []))
    if prediction in classes:
        confidence = probabilities[classes.index(prediction)] * 100
    else:
        # No class ordering available: fall back to the largest probability.
        confidence = max(probabilities) * 100

    label = "FAKE" if prediction == 1 else "REAL"
    return label, confidence
def create_visualizations(results, y_test):
    """Display a two-panel figure comparing the trained models.

    The left panel is a bar chart of each model's accuracy, annotated with
    the value; the right panel is a confusion-matrix heatmap for the model
    with the highest accuracy.

    Args:
        results: Mapping of model name to a dict containing at least
            ``'accuracy'`` and ``'predictions'`` entries.
        y_test: True labels the stored predictions are compared against.
    """
    print("\n📊 Creating visualizations...")

    plt.figure(figsize=(12, 5))

    # --- Left panel: accuracy per model ---------------------------------
    plt.subplot(1, 2, 1)
    model_names = list(results.keys())
    scores = [results[model_name]['accuracy'] for model_name in model_names]
    palette = ['skyblue', 'lightgreen', 'lightcoral', 'lightsalmon']
    rects = plt.bar(model_names, scores, color=palette)
    plt.title('Model Accuracy Comparison')
    plt.ylabel('Accuracy')
    plt.ylim(0, 1)
    plt.xticks(rotation=45, ha='right')

    # Write each accuracy just above the top of its bar, centred on the bar.
    for rect, score in zip(rects, scores):
        x_center = rect.get_x() + rect.get_width()/2
        y_above = rect.get_height() + 0.01
        plt.text(x_center, y_above, f'{score:.2f}', ha='center', va='bottom')

    # --- Right panel: confusion matrix of the most accurate model -------
    plt.subplot(1, 2, 2)
    top_name = max(results.keys(), key=lambda k: results[k]['accuracy'])
    top_predictions = results[top_name]['predictions']
    matrix = confusion_matrix(y_test, top_predictions)
    class_names = ['Real', 'Fake']
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.title(f'Confusion Matrix - {top_name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')

    plt.tight_layout()
    plt.show()
def _save_artifacts(model, model_name, vectorizer, output_dir="models"):
    """Persist the fitted model and vectorizer under ``output_dir`` with joblib."""
    # Local imports keep this helper self-contained; joblib is only needed here.
    from pathlib import Path

    import joblib

    target_dir = Path(output_dir)
    # joblib.dump does not create missing directories, so make sure it exists.
    target_dir.mkdir(parents=True, exist_ok=True)

    model_path = target_dir / f"{model_name.replace(' ', '_')}.pkl"
    vectorizer_path = target_dir / "tfidf_vectorizer.pkl"
    joblib.dump(model, str(model_path))
    joblib.dump(vectorizer, str(vectorizer_path))
    print(f"💾 Saved model and vectorizer to '{output_dir}/'")


def main():
    """Run the whole pipeline: load, train, predict, plot, summarise, save.

    Reads ``data/Fake.csv`` and ``data/True.csv`` (paths relative to the
    current working directory); both files must provide ``title`` and
    ``text`` columns. Trains every model from ``train_models``, prints
    predictions for a few hand-written headlines using Logistic Regression,
    shows the comparison plots, prints a summary and finally writes the
    Logistic Regression model and the TF-IDF vectorizer to ``models/``.
    """
    print("🔍 Fake News Detection System")
    print("=" * 40)

    # Step 1: Load both CSVs, label them and shuffle them together
    df_fake = pd.read_csv("data/Fake.csv")
    df_true = pd.read_csv("data/True.csv")
    df_fake['label'] = 1  # Fake = 1
    df_true['label'] = 0  # Real = 0
    df = pd.concat([df_fake, df_true], ignore_index=True)
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    print(f"Dataset loaded: {len(df)} articles")
    print(f"Real news: {sum(df['label'] == 0)}")
    print(f"Fake news: {sum(df['label'] == 1)}")

    # Step 2: Preprocess text (title and body are classified together)
    df['processed_text'] = (
        df['title'].astype(str) + " " + df['text'].astype(str)
    ).apply(preprocess_text)
    y = df['label']

    # Step 3: Split data BEFORE building features, so the vectorizer never
    # sees the test documents (fitting TF-IDF on the full corpus would leak
    # test-set vocabulary and IDF statistics into the reported accuracy).
    text_train, text_test, y_train, y_test = train_test_split(
        df['processed_text'], y, test_size=0.3, random_state=42, stratify=y
    )

    # Step 4: Create TF-IDF features, fitted on the training text only
    vectorizer = TfidfVectorizer(
        max_features=5000,
        stop_words='english',
        ngram_range=(1, 2)  # use unigrams + bigrams
    )
    X_train = vectorizer.fit_transform(text_train)
    X_test = vectorizer.transform(text_test)

    # Step 5: Train models
    results = train_models(X_train, X_test, y_train, y_test)

    # Step 6: Pick the model used for predictions and saving.
    # Logistic Regression is chosen deliberately rather than by top score,
    # so "best accuracy" below is this model's accuracy.
    best_model_name = 'Logistic Regression'
    best_model = results[best_model_name]['model']
    best_accuracy = results[best_model_name]['accuracy']

    # Step 7: Test with custom articles
    print("=" * 50)
    print("Testing with custom articles:")
    print("=" * 50)
    test_articles = [
        "The economy grew by 3% last quarter, according to government data.",
        "Doctors warn about fake COVID-19 cures spreading on social media.",
        "BREAKING: NASA confirms aliens are already living on Mars!",
        "Miracle plant can cure cancer in 7 days — experts shocked!",
        "Government launches new scheme to provide free education to all."
    ]
    for i, article in enumerate(test_articles, 1):
        label, confidence = predict_article(best_model, vectorizer, article)
        print(f"Test {i}: {article}")
        print(f"Prediction: {label} ({confidence:.1f}% confidence)")
        print()

    # Step 8: Create visualizations
    create_visualizations(results, y_test)

    # Step 9: Summary
    # NOTE(review): the first and last messages still describe a sample
    # dataset although the CSV files are loaded above - confirm the wording.
    print("🎉 DAY 1 COMPLETE!")
    print("✅ Sample dataset created")
    print("✅ Multiple models trained")
    print("✅ Predictions working")
    print("✅ Visualizations created")
    print(f"✅ Best accuracy: {best_accuracy:.2f}")
    print("\n🚀 Ready for Day 2: Real dataset integration!")

    # Step 10: Persist the chosen model and its vectorizer.
    # This has to happen inside main(): best_model, best_model_name and
    # vectorizer are locals and do not exist at module level.
    _save_artifacts(best_model, best_model_name, vectorizer)


if __name__ == "__main__":
    main()