Fake News?
In this blog post, I will develop and assess a fake news classifier.
Making a data set
After reading in the data, I created a function that removes stop words (as, the, of, etc.) using nltk.corpus. The function then returns a batched tf.data.Dataset with the title and text of each article as the inputs and the fake label as the output.
import tensorflow as tf
from nltk.corpus import stopwords # may require nltk.download('stopwords') the first time

stop = stopwords.words('english')

def make_dataset(inpdata):
    # remove stop words from the title and text columns
    inpdata["title"] = inpdata["title"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
    inpdata['text'] = inpdata['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
    data = tf.data.Dataset.from_tensor_slices(
        ( # dictionary for input data/features
            { "title": inpdata[["title"]],
              "text": inpdata[["text"]]
            },
            # dictionary for output data/labels
            { "fake": inpdata["fake"] # single brackets return one column, double brackets return a full df
            }
        )
    )
    return data.batch(100)
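For reference, here is a minimal sketch of the reading-in step mentioned above; the filename is a placeholder for wherever your copy of the data lives, and the file is assumed to have title, text, and fake columns.
import pandas as pd
mydata = pd.read_csv("fake_news_train.csv") # hypothetical filename -- substitute your own data file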
Validation
Then I shuffled the dataset I made and set aside 20% of it for validation.
mydata = make_dataset(mydata)
mydata = mydata.shuffle(buffer_size = len(mydata))
train_size = int(0.8*len(mydata))
val_size = int(0.2*len(mydata)) #20% validation
train = mydata.take(train_size)
val = mydata.skip(train_size).take(val_size)
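As a quick sanity check (not in the original walkthrough), the number of batches in each split can be inspected:
print(len(train), len(val)) # roughly an 80/20 split of the batched dataset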
Base Rate
To determine the base rate, I start by creating an iterator over the labels in the training data.
labels_iterator= train.unbatch().map(lambda dict_title_text, label: label).as_numpy_iterator()
Then I create two counters. Depending on the value of the label fake, one of these counters is incremented, keeping track of how many real and how many fake articles there are. The base rate is the accuracy a model would get by always predicting the majority class, and it is computed from these counts after the loop below.
first_num = 0 # counter for real articles
sec_num = 0 # counter for fake articles
for i in labels_iterator:
    if i["fake"] == 0: # if it's not fake, increase the first counter
        first_num += 1
    else: # if it's fake, increase the second counter
        sec_num += 1
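With those counts, the base rate is simply the share of the majority class; a minimal sketch:
base_rate = max(first_num, sec_num) / (first_num + sec_num)
print(f"base rate: {base_rate:.3f}")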
Text Vectorization
import re
import string
from tensorflow.keras.layers import TextVectorization

# preparing a text vectorization layer for the tf model
size_vocabulary = 2000

def standardization(input_data):
    # lowercase the text and strip punctuation
    lowercase = tf.strings.lower(input_data)
    no_punctuation = tf.strings.regex_replace(lowercase,
                                              '[%s]' % re.escape(string.punctuation), '')
    return no_punctuation

title_vectorize_layer = TextVectorization(
    standardize=standardization,
    max_tokens=size_vocabulary, # only consider this many words
    output_mode='int',
    output_sequence_length=500)

title_vectorize_layer.adapt(train.map(lambda x, y: x["title"]))
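To sanity-check the adapted layer, it can be called directly on a small batch of strings (the example title below is made up):
example = title_vectorize_layer(["breaking news congress passes new bill"])
print(example.shape) # (1, 500): one row of integer token ids, zero-padded to length 500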
First Model - Using article titles to detect fake news
To start my first model, I defined the inputs. This model will just take the article title as input.
# imports for building the models
from tensorflow import keras
from tensorflow.keras import layers, losses

# inputs
titles_input = keras.Input(
    shape = (1,),
    name = "title",
    dtype = "string"
)
Next, I created the layers needed to process the titles.
# apply the TextVectorization layer to titles_input, then add embedding, dropout, pooling, and dense layers
titles_features = title_vectorize_layer(titles_input)
titles_features = layers.Embedding(size_vocabulary, output_dim = 3)(titles_features)
titles_features = layers.Dropout(0.2)(titles_features)
titles_features = layers.GlobalAveragePooling1D()(titles_features)
titles_features = layers.Dropout(0.2)(titles_features)
titles_features = layers.Dense(32, activation='relu')(titles_features)
Then I created the output layer.
output = layers.Dense(2, name = "fake")(titles_features)
Defined the model
model1 = keras.Model(
    inputs = titles_input,
    outputs = output
)
Visualized the model
keras.utils.plot_model(model1)
Compile the model
model1.compile(optimizer="adam",
               loss = losses.SparseCategoricalCrossentropy(from_logits=True),
               metrics=["accuracy"])
Fit the model
history = model1.fit(train,
                     validation_data=val,
                     epochs = 20,
                     verbose = False)
Create a plot to visualize the model’s accuracy
from matplotlib import pyplot as plt
plt.plot(history.history["accuracy"])
plt.plot(history.history["val_accuracy"])
Model 2 - Using article text to detect fake news
I follow the same steps as the first model, but replace the titles with the article text.
Text Vectorization
text_vectorize_layer = TextVectorization(
    standardize=standardization,
    max_tokens=size_vocabulary, # only consider this many words
    output_mode='int',
    output_sequence_length=500)
text_vectorize_layer.adapt(train.map(lambda x, y: x["text"]))
Inputs
text_input = keras.Input(
    shape = (1,),
    name = "text",
    dtype = "string"
)
Layers for processing the texts
# apply the TextVectorization layer to text_input, then add embedding, dropout, pooling, and dense layers
text_features = text_vectorize_layer(text_input)
text_features = layers.Embedding(size_vocabulary, output_dim = 3)(text_features)
text_features = layers.Dropout(0.2)(text_features)
text_features = layers.GlobalAveragePooling1D()(text_features)
text_features = layers.Dropout(0.2)(text_features)
text_features = layers.Dense(32, activation='relu')(text_features)
Output layer
output = layers.Dense(2, name = "fake")(text_features)
Create the model
model2 = keras.Model(
    inputs = text_input,
    outputs = output
)
Visualize the model
keras.utils.plot_model(model2)
Compile the model
model2.compile(optimizer="adam",
               loss = losses.SparseCategoricalCrossentropy(from_logits=True),
               metrics=["accuracy"])
Fit the model
history = model2.fit(train,
                     validation_data=val,
                     epochs = 20,
                     verbose = False)
Create a plot of the model’s accuracy
from matplotlib import pyplot as plt
plt.plot(history.history["accuracy"])
plt.plot(history.history["val_accuracy"])
Model 3 - Using article titles and text to detect fake news
Concatenate the features from the titles and text models. Since each branch ends in a 32-unit Dense layer, the concatenated vector has 64 features.
main = layers.concatenate([titles_features, text_features], axis = 1)
Create the output layer
main = layers.Dense(32, activation='relu')(main)
output = layers.Dense(2, name="fake")(main)
#should always have the number of classes in final layer/output
#2 because fake and not fake
Create the model
model3 = keras.Model(
    inputs = [titles_input, text_input],
    outputs = output
)
Visualize the model
keras.utils.plot_model(model3)
Compile the model
model3.compile(optimizer="adam",
               loss = losses.SparseCategoricalCrossentropy(from_logits=True),
               metrics=["accuracy"])
Fit the model
history = model3.fit(train,
                     validation_data=val,
                     epochs = 20,
                     verbose = False)
Create a plot to visualize the model’s accuracy
from matplotlib import pyplot as plt
plt.plot(history.history["accuracy"])
plt.plot(history.history["val_accuracy"])