KEMBAR78
Source Code Python Jemmy | PDF | Computer Data | Software Development
0% found this document useful (0 votes)
161 views7 pages

Source Code Python Jemmy

1. The document describes steps to collect and preprocess Indonesian-language Google Play reviews of the Free Fire game for sentiment analysis, including installing the necessary libraries, scraping and importing the data, cleaning text by removing stopwords and punctuation, stemming words, and saving the final preprocessed data. 2. Key preprocessing steps include case folding, tokenizing text, removing stopwords, normalizing words, and stemming words. The document provides code snippets in Python to implement each step using libraries like google-play-scraper, NLTK, Pandas, Sastrawi, and NumPy. 3. The final preprocessed data is saved in CSV and Excel formats for further analysis.

Uploaded by

Fadilah Riczky
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
161 views7 pages

Source Code Python Jemmy

1. The document describes steps to collect and preprocess Indonesian-language Google Play reviews of the Free Fire game for sentiment analysis, including installing the necessary libraries, scraping and importing the data, cleaning text by removing stopwords and punctuation, stemming words, and saving the final preprocessed data. 2. Key preprocessing steps include case folding, tokenizing text, removing stopwords, normalizing words, and stemming words. The document provides code snippets in Python to implement each step using libraries like google-play-scraper, NLTK, Pandas, Sastrawi, and NumPy. 3. The final preprocessed data is saved in CSV and Excel formats for further analysis.

Uploaded by

Fadilah Riczky
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 7

1.

pip install google-play-scraper

2.

from google_play_scraper import Sort, reviews


# Fetch the first batch of Indonesian reviews for the Free Fire app.
result, continuation_token = reviews(
    'com.dts.freefireth',
    lang='id',  # defaults to 'en'
    country='id',  # defaults to 'us'
    sort=Sort.MOST_RELEVANT,  # defaults to Sort.MOST_RELEVANT
    count=4000,  # defaults to 100
)

# Fetch the following page using the continuation token and append it to the
# first batch. Rebinding `result` here would throw away the reviews fetched
# above, so the new page is collected under a separate name and merged.
more_reviews, _ = reviews(
    'com.dts.freefireth',
    continuation_token=continuation_token,  # defaults to None (load from the beginning)
)
result.extend(more_reviews)
print(result)

3.

import pandas as pd

df = pd.DataFrame(result)
df.to_csv("D:/TestData11.CSV")

4.
pip install nltk

5.
import nltk
nltk.download()

6.
pip install Sastrawi
7.
pip install numpy

8.
import pandas as pd
import numpy as np

TWEET_DATA = pd.read_csv("D:/data_ff.csv")

TWEET_DATA.head()

9.
TWEET_DATA.to_csv("D:/data_ff.csv")

10.

# ------ Case Folding --------


# use the pandas Series.str.lower() function
TWEET_DATA['content'] = TWEET_DATA['content'].str.lower()

print('Case Folding Result : \n')


print(TWEET_DATA['content'].head(5))
print('\n\n\n')

11.

import string
import re #regex library

# import word_tokenize & FreqDist from NLTK


from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# ------ Tokenizing ---------

def remove_tweet_special(text):
    """Strip escape residue, non-ASCII characters, mentions, hashtags and links.

    Args:
        text: The raw text of one document.

    Returns:
        The cleaned text. Characters that cannot be encoded as ASCII come
        back as ``?`` because the ASCII round-trip uses the ``'replace'``
        error handler.
    """
    # Remove literal (escaped) tab, newline and unicode markers, then drop
    # any backslash that is left over.
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    # Replace non-ASCII characters (emoticons, Chinese words, etc.) with '?'.
    text = text.encode('ascii', 'replace').decode('ascii')
    # Remove mentions, hashtags and complete links, then collapse whitespace.
    # The pattern is a raw string so that \w and \S reach the regex engine
    # instead of being treated as (invalid) string escapes.
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", text).split())
    # Remove incomplete URLs (a bare scheme with nothing after it).
    return text.replace("http://", " ").replace("https://", " ")

TWEET_DATA['content'] = TWEET_DATA['content'].apply(remove_tweet_special)

#remove punctuation
def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted."""
    return text.translate(str.maketrans("", "", string.punctuation))

TWEET_DATA['content'] = TWEET_DATA['content'].apply(remove_punctuation)

#remove whitespace leading & trailing


def remove_whitespace_LT(text):
    """Return ``text`` without leading and trailing whitespace."""
    return text.strip()

TWEET_DATA['content'] = TWEET_DATA['content'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace


def remove_whitespace_multiple(text):
    """Collapse every run of whitespace in ``text`` into a single space."""
    # Raw string so that \s is passed to the regex engine unchanged.
    return re.sub(r'\s+', ' ', text)

TWEET_DATA['content'] = TWEET_DATA['content'].apply(remove_whitespace_multiple)

# remove single char


def remove_singl_char(text):
    """Delete every stand-alone single ASCII letter from ``text``.

    Only the letter itself is removed; the whitespace around it is kept.
    Single digits are not affected.
    """
    return re.sub(r"\b[a-zA-Z]\b", "", text)

TWEET_DATA['content'] = TWEET_DATA['content'].apply(remove_singl_char)

# NLTK word tokenize


def word_tokenize_wrapper(text):
    """Return the result of NLTK's ``word_tokenize`` for ``text``."""
    return word_tokenize(text)

TWEET_DATA['content_tokens'] = TWEET_DATA['content'].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n')


print(TWEET_DATA['content_tokens'].head())
print('\n\n\n')
11.

# NLTK calc frequency distribution


def freqDist_wrapper(text):
    """Build an NLTK ``FreqDist`` from ``text`` (applied to token lists below)."""
    return FreqDist(text)

# Frequency distribution of the tokens of each document. The assignment must
# sit on one logical line; a bare line break after `=` is a syntax error.
TWEET_DATA['content_tokens_fdist'] = TWEET_DATA['content_tokens'].apply(freqDist_wrapper)

print('Frequency Tokens : \n')


print(TWEET_DATA['content_tokens_fdist'].head().apply(lambda x : x.most_common()))

12.

from nltk.corpus import stopwords

# ----------------------- get stopword from NLTK stopword -------------------------------


# get stopword indonesia
list_stopwords = stopwords.words('indonesian')

# ---------------------------- manually add stopword -----------------------------------


# append additional stopword
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo',
'kalo', 'amp', 'biar', 'bikin', 'bilang',
'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
'jd', 'jgn', 'sdh', 'aja', 'n', 't',
'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
'&amp', 'yah'])

# ----------------------- add stopword from txt file ------------------------------------


# read txt stopword using pandas
txt_stopword = pd.read_csv("D:/stopwords.txt", names= ["stopwords"], header = None)

# convert stopword string to list & append additional stopword


list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))

# ---------------------------------------------------------------------------------------

# convert list to set (removes duplicates, fast membership tests)


list_stopwords = set(list_stopwords)
# remove stopwords from the token list
def stopwords_removal(words):
    """Return the tokens of ``words`` that are not in ``list_stopwords``.

    Args:
        words: An iterable of tokens for one document.

    Returns:
        A new list with the original order preserved and every token found
        in the module-level ``list_stopwords`` collection dropped.
    """
    return [word for word in words if word not in list_stopwords]

# Token lists with stopwords removed. The assignment must sit on one logical
# line; a bare line break after `=` is a syntax error.
TWEET_DATA['content_tokens_WSW'] = TWEET_DATA['content_tokens'].apply(stopwords_removal)

print(TWEET_DATA['content_tokens_WSW'].head())

13.

normalizad_word = pd.read_excel("D:/normalisasi.xlsx")

# Lookup table built from the spreadsheet: value of the first column ->
# value of the second column.
normalizad_word_dict = {}

for index, row in normalizad_word.iterrows():
    # `row` is a Series labelled by column name, so the columns are read by
    # position with .iloc (plain row[0] relies on deprecated positional
    # fallback). When a key occurs more than once the first row wins.
    if row.iloc[0] not in normalizad_word_dict:
        normalizad_word_dict[row.iloc[0]] = row.iloc[1]

def normalized_term(document):
    """Replace each term of ``document`` by its entry in ``normalizad_word_dict``.

    Args:
        document: An iterable of tokens for one document.

    Returns:
        A new list in which every term that is a key of the module-level
        ``normalizad_word_dict`` is replaced by the mapped value; all other
        terms are kept unchanged.
    """
    return [normalizad_word_dict.get(term, term) for term in document]

# Token lists after the normalization lookup. The assignment must sit on one
# logical line; a bare line break after `=` is a syntax error.
TWEET_DATA['content_normalized'] = TWEET_DATA['content_tokens_WSW'].apply(normalized_term)

TWEET_DATA['content_normalized'].head(10)

14.

conda install -c conda-forge swifter

15.

# import Sastrawi package


from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter
# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    """Return ``term`` stemmed by the module-level Sastrawi ``stemmer``."""
    return stemmer.stem(term)

# Collect every distinct term first so that each one is stemmed only once,
# no matter how many documents it appears in.
term_dict = {}

for document in TWEET_DATA['content_normalized']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '  # placeholder, filled with the stem below

print(len(term_dict))
print("------------------------")

# Only the values are reassigned here, so iterating over the dict is safe.
for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term, ":", term_dict[term])

print(term_dict)
print("------------------------")

# apply stemmed term to dataframe


def get_stemmed_term(document):
    """Map each term of ``document`` to its stem via ``term_dict``.

    Args:
        document: An iterable of tokens; every token must already be a key
            of the module-level ``term_dict``.

    Returns:
        A new list with the looked-up value for each term, in order.

    Raises:
        KeyError: If a term is not present in ``term_dict``.
    """
    return [term_dict[term] for term in document]

# Stemmed token lists, computed through swifter's apply. The assignment must
# sit on one logical line; a bare line break after `=` is a syntax error.
TWEET_DATA['content_tokens_stemmed'] = TWEET_DATA['content_normalized'].swifter.apply(get_stemmed_term)
print(TWEET_DATA['content_tokens_stemmed'])

16.

TWEET_DATA.to_csv("data_ff.csv")
17.
TWEET_DATA.to_excel("data_ff.xlsx")

You might also like