from flask import Flask, render_template, request
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import pickle
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords
import nltk
# Initialize the Flask application
app = Flask(__name__)
# Load the classification models
lr_model = pickle.load(open('model/lr_modelNormal.pkl', 'rb'))
tfidf_model = pickle.load(open('model/tfidf_modelLatest.pkl', 'rb'))
# Download the required NLTK data
nltk.download('stopwords')
nltk.download('punkt')
# Label encoding for the categories
labels_encode = {1: "Research", 0: "News"}
# Use a set so stopword lookups during preprocessing are O(1)
stop_words = set(stopwords.words('indonesian'))
# Save the stopwords to a file
with open('stopwords.txt', 'w') as f:
    for item in sorted(stop_words):
        f.write("%s\n" % item)
# Scrape the article title and body from a CNBC Indonesia URL
def scrape_news(url):
    isi = []
    judul = []
    response = requests.get(url)
    if response.status_code == 200:
        article_full = BeautifulSoup(response.content, "html.parser")
        judul_artikel = article_full.find("h1", class_="mb-4 text-32 font-extrabold").text.strip()
        artikel_element = article_full.find("div", class_="detail-text")
        artikel_teks = [p.get_text(strip=True) for p in artikel_element.find_all("p")]
        artikel_content = "\n".join(artikel_teks)
        isi.append(artikel_content)
        judul.append(judul_artikel)
    return pd.DataFrame({"judul": judul, "isi": isi})
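# Illustrative usage (hypothetical URL; the CSS classes above match CNBC
# Indonesia's article layout, so other sites will not parse):
#   df = scrape_news("https://www.cnbcindonesia.com/...")
#   df  -> one-row DataFrame with columns "judul" (title) and "isi" (body)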
# Text-cleaning function
def cleansing(text):
    text = re.sub(r'[\s]+', ' ', text)                      # collapse whitespace runs
    text = text.encode('ascii', 'ignore').decode('utf-8')   # drop non-ASCII characters
    text = re.sub(r'[^\x00-\x7f]', r'', text)
    text = re.sub(r'\d+', '', text)                         # remove digits
    text = text.lower()
    text = re.sub(r'\b-\b', ' ', text)                      # split hyphenated words
    text = re.sub(r'[^\w\s]+', ' ', text)                   # replace punctuation with spaces
    text = text.replace('\n', '')
    return text
# Stopword-removal function
def remove_stopwords(text):
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)
# Build the Sastrawi stemmer once at import time; constructing it per call is costly
stemmer = StemmerFactory().create_stemmer()
# Stemming function
def stemming(text):
    return stemmer.stem(text)
# Main text-preprocessing function: clean, remove stopwords, then stem
def preprocess_text(text):
    clean_text = cleansing(text)
    stopword_text = remove_stopwords(clean_text)
    return stemming(stopword_text)
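# Illustrative example (the exact output depends on the NLTK Indonesian
# stopword list and the Sastrawi stemmer, so treat this as approximate):
#   preprocess_text("Peneliti menyatakan bahwa inflasi naik 5% pada 2024!")
#   -> roughly "teliti nyata inflasi naik"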
# Classify a text with the default logistic regression model
def classify_news(text):
    processed_text = preprocess_text(text)
    text_vectorized = tfidf_model.transform([processed_text])
    prediction = lr_model.predict(text_vectorized)
    prediction_proba = lr_model.predict_proba(text_vectorized)
    return prediction[0], prediction_proba[0]
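# Illustrative return shape (values made up; assumes lr_model.classes_ is
# [0, 1], i.e. probabilities are ordered [News, Research] per labels_encode):
#   classify_news(some_text) -> (1, array([0.12, 0.88]))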
# Classify a text with a caller-supplied model
def classify_news_with_model(text, model):
    processed_text = preprocess_text(text)
    text_vectorized = tfidf_model.transform([processed_text])
    prediction = model.predict(text_vectorized)
    prediction_proba = model.predict_proba(text_vectorized)
    # Return the predicted category and the class probabilities
    return prediction[0], prediction_proba[0]
# Route for the main page
@app.route('/', methods=['GET', 'POST'])
def index():
    if request.method == 'POST':
        link_news = request.form.get("link_news")
        selected_model = request.form.get("model")
        # Input validation
        if not link_news:
            return render_template('index.html', error="The link must not be empty.")
        if "cnbcindonesia" not in link_news:
            return render_template('index.html', error="Invalid link. Make sure it points to a CNBC Indonesia article.")
        # Fetch the news content from the given URL
        news = scrape_news(link_news)
        news['cleaned_text'] = news["isi"].apply(preprocess_text)
        # Classify with the selected model
        if selected_model == "logistic_regression":
            prediction, probabilities = classify_news(news['cleaned_text'][0])
            category_name = labels_encode[prediction]
            prob_news_percent = round(probabilities[0] * 100, 3)
            prob_research_percent = round(probabilities[1] * 100, 3)
        elif selected_model == "lr_modelNcompo5":
            # Load the 5-component pipeline
            with open('model_pipeline_5.pkl', 'rb') as f:
                pipeline_5 = pickle.load(f)
            # Transform using the loaded TF-IDF model
            X_new_tfidf = tfidf_model.transform([news['cleaned_text'][0]])
            prediction = pipeline_5.predict(X_new_tfidf)
            probabilities = pipeline_5.predict_proba(X_new_tfidf)
            category_name = labels_encode[prediction[0]]
            prob_news_percent = round(probabilities[0][0] * 100, 3)      # probability of the news class
            prob_research_percent = round(probabilities[0][1] * 100, 3)  # probability of the research class
        elif selected_model == "lr_modelNcompo10":
            # Load the 10-component pipeline
            with open('model_pipeline_10.pkl', 'rb') as f:
                pipeline_10 = pickle.load(f)
            # Transform using the loaded TF-IDF model
            X_new_tfidf = tfidf_model.transform([news['cleaned_text'][0]])
            prediction = pipeline_10.predict(X_new_tfidf)
            probabilities = pipeline_10.predict_proba(X_new_tfidf)
            category_name = labels_encode[prediction[0]]
            prob_news_percent = round(probabilities[0][0] * 100, 3)      # probability of the news class
            prob_research_percent = round(probabilities[0][1] * 100, 3)  # probability of the research class
        else:
            # An unrecognized model choice would otherwise leave the variables below unbound
            return render_template('index.html', error="Unknown model selection.")
        # Render the result with the probabilities as percentages
        return render_template('index.html', result=category_name, prob_news=prob_news_percent, prob_research=prob_research_percent)
    return render_template('index.html')
if __name__ == '__main__':
    app.run(debug=True, port=5001)
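# --- Illustrative client (a separate script, not part of app.py) ---
# A minimal sketch of how to exercise the running app from Python, assuming
# it is serving at http://127.0.0.1:5001 and that the placeholder URL below
# is replaced with a real CNBC Indonesia article link. The form field names
# (link_news, model) and the model keys match the ones read by index() above.
#
#   import requests
#
#   resp = requests.post(
#       "http://127.0.0.1:5001/",
#       data={
#           "link_news": "https://www.cnbcindonesia.com/...",  # placeholder
#           "model": "logistic_regression",  # or lr_modelNcompo5 / lr_modelNcompo10
#       },
#   )
#   print(resp.status_code)  # 200 on success; the body is the rendered index.html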