Fake News?

In this blog post, I will develop and assess a fake news classifier.

Making a data set

After reading in the data, I created a function that removes stop words ("as", "the", "of", etc.) using nltk.corpus. The function returns a tf.data.Dataset with the title and text of each article as the inputs and the fake label as the output.

import tensorflow as tf
from nltk.corpus import stopwords

stop = stopwords.words('english')

def make_dataset(inpdata):
    # remove stop words from both text columns
    inpdata["title"] = inpdata["title"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop]))
    inpdata["text"] = inpdata["text"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop]))

    data = tf.data.Dataset.from_tensor_slices(
        ( # dictionary for input data/features
            { "title": inpdata[["title"]],
              "text": inpdata[["text"]]
            },
            # dictionary for output data/labels
            { "fake": inpdata["fake"] # single brackets return one column; double brackets return a full dataframe
            }
        )
    )
    return data.batch(100) # batch for faster training

Validation

Then I split off 20% of the dataset to use for validation.

mydata = make_dataset(mydata)

mydata = mydata.shuffle(buffer_size = len(mydata))

train_size = int(0.8*len(mydata)) # sizes are measured in batches, not rows
val_size   = int(0.2*len(mydata)) # 20% validation

train = mydata.take(train_size)
val = mydata.skip(train_size).take(val_size)
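Because take and skip operate on batches of 100 here, it is worth confirming the resulting sizes. A quick check, assuming the train and val datasets defined above:

print(len(train), "training batches,", len(val), "validation batches")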

Base Rate

To determine the base rate, I start by creating an iterator to access the labels in the training data.

labels_iterator= train.unbatch().map(lambda dict_title_text, label: label).as_numpy_iterator()

Then I create two counters. Depending on the value of the label fake, one of these counters is incremented, keeping track of how many real and fake articles there are. These counts determine the base rate: the accuracy of a model that always guesses the majority class (computed after the loop below).

real_count = 0 # number of real articles
fake_count = 0 # number of fake articles
for label in labels_iterator:
    if label["fake"] == 0: # if it's real, increment the real counter
        real_count += 1
    else: # if it's fake, increment the fake counter
        fake_count += 1
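With these counts, the base rate is just the share of the majority class. A minimal sketch of that calculation, assuming the real_count and fake_count variables from the loop above:

total = real_count + fake_count
base_rate = max(real_count, fake_count) / total
print(f"real: {real_count}, fake: {fake_count}, base rate: {base_rate:.3f}")

Any model worth keeping should beat this accuracy.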

Text Vectorization

# preparing a text vectorization layer for the tf model
import re
import string
from tensorflow import keras
from tensorflow.keras import layers, losses
from tensorflow.keras.layers import TextVectorization

size_vocabulary = 2000

def standardization(input_data):
    # lowercase the text and strip all punctuation
    lowercase = tf.strings.lower(input_data)
    no_punctuation = tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation),'')
    return no_punctuation

title_vectorize_layer = TextVectorization(
    standardize=standardization,
    max_tokens=size_vocabulary, # only consider this many words
    output_mode='int',
    output_sequence_length=500) 

title_vectorize_layer.adapt(train.map(lambda x, y: x["title"]))
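To sanity-check the adapted layer, it can be run on a sample title. This is just an illustrative snippet (the sample string is made up), and the exact token ids depend on the adapted vocabulary:

sample = tf.constant(["Breaking news about the economy"])
print(title_vectorize_layer(sample)) # integer-encoded and padded to length 500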

First Model - Using article titles to detect fake news

To start my first model, I defined the inputs. This model takes only the article title as input.

# inputs
titles_input = keras.Input(
    shape = (1,), 
    name = "title",
    dtype = "string"
)

Next, I created the layers needed for processing the titles.

titles_features = title_vectorize_layer(titles_input) # apply the TextVectorization layer to titles_input
titles_features = layers.Embedding(size_vocabulary, output_dim = 3)(titles_features) # learn a 3-dimensional embedding per token
titles_features = layers.Dropout(0.2)(titles_features)
titles_features = layers.GlobalAveragePooling1D()(titles_features) # average the embeddings across the sequence
titles_features = layers.Dropout(0.2)(titles_features)
titles_features = layers.Dense(32, activation='relu')(titles_features)

I created an output layer

output = layers.Dense(2, name = "fake")(titles_features) # one unit per class (real, fake)

Defined the model

model1 = keras.Model(
    inputs = titles_input,
    outputs = output
) 

Visualized the model

keras.utils.plot_model(model1)

Compile the model

model1.compile(optimizer="adam",
              loss = losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=["accuracy"])

Fit the model

history = model1.fit(train, 
                    validation_data=val,
                    epochs = 20, 
                    verbose = False)

Create a plot to visualize the model’s accuracy

from matplotlib import pyplot as plt
plt.plot(history.history["accuracy"], label = "training")
plt.plot(history.history["val_accuracy"], label = "validation")
plt.legend()

Model 2 - Using article text to detect fake news

I follow the same steps as the first model, but replace the titles with the article text.

Text Vectorization

text_vectorize_layer = TextVectorization(
    standardize=standardization,
    max_tokens=size_vocabulary, # only consider this many words
    output_mode='int',
    output_sequence_length=500) 

text_vectorize_layer.adapt(train.map(lambda x, y: x["text"]))

Inputs

text_input = keras.Input(
    shape = (1,), 
    name = "text",
    dtype = "string"
)

Layers for processing the texts

text_features = text_vectorize_layer(text_input) # apply the TextVectorization layer to text_input
text_features = layers.Embedding(size_vocabulary, output_dim = 3)(text_features)
text_features = layers.Dropout(0.2)(text_features)
text_features = layers.GlobalAveragePooling1D()(text_features)
text_features = layers.Dropout(0.2)(text_features)
text_features = layers.Dense(32, activation='relu')(text_features)

Output layer

output = layers.Dense(2, name = "fake")(text_features) 

Create the model

model2 = keras.Model(
    inputs = text_input,
    outputs = output
) 

Visualize the model

keras.utils.plot_model(model2)

Compile the model

model2.compile(optimizer="adam",
              loss = losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=["accuracy"])

Fit the model

history = model2.fit(train, 
                    validation_data=val,
                    epochs = 20, 
                    verbose = False)

Create a plot of the model’s accuracy

from matplotlib import pyplot as plt
plt.plot(history.history["accuracy"], label = "training")
plt.plot(history.history["val_accuracy"], label = "validation")
plt.legend()

Model 3 - Using article titles and text to detect fake news

Concatenate the features from the titles and text models.

main = layers.concatenate([titles_features, text_features], axis = 1)

Create the output layer

main = layers.Dense(32, activation='relu')(main)
output = layers.Dense(2, name="fake")(main)
# the final layer should always have one unit per class:
# 2 here, because articles are either fake or not fake

Create the model

model3 = keras.Model(
    inputs = [titles_input, text_input],
    outputs = output
)

Visualize the model

keras.utils.plot_model(model3)

Compile the model

model3.compile(optimizer="adam",
              loss = losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=["accuracy"])

Fit the model

history = model3.fit(train, 
                    validation_data=val,
                    epochs = 20, 
                    verbose = False)

Create a plot to visualize the model’s accuracy

from matplotlib import pyplot as plt
plt.plot(history.history["accuracy"], label = "training")
plt.plot(history.history["val_accuracy"], label = "validation")
plt.legend()