# The following code builds a neural network whose goal is to classify a wine
# sample's quality from features such as pH, density, acidity, sulfur dioxide,
# and several others. The output is a 'bad', 'okay', or 'great' wine.
# The network has two hidden layers plus one input and one output layer. A report
# on the classification results is also printed.
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import keras
from keras import regularizers
import pandas as pd
# DATASET: https://archive.ics.uci.edu/dataset/186/wine+quality
# Extract the data from the file and split it into features and labels.
# For pre-processing, standardize the features and encode the labels, since the
# features mix discrete and continuous values on very different scales
# (roughly 0 to 2000).
# Load the dataset
data = pd.read_excel("winequality-white.xlsx",header = 0)
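# Note: UCI distributes this dataset as winequality-white.csv with ';' separators;
# this script assumes the file was converted to .xlsx beforehand. To read the raw
# file directly, pd.read_csv("winequality-white.csv", sep=";") also works.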
features = data.iloc[:, 0:-1]
labels = data.iloc[:,-1]
# Bin the 0-10 quality scores into three coarse classes for better classification results
labels = pd.Series(["bad" if elm <= 4 else "okay" if 4 < elm <= 8 else "great" for elm in labels])
scaler = StandardScaler()
features = scaler.fit_transform(features)
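# Note: fitting the scaler on the full dataset lets test-set statistics leak into
# training; a stricter pipeline would call scaler.fit_transform on the training
# split only and scaler.transform on the test split, after the split below.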
encoder = LabelEncoder()
labels = encoder.fit_transform(labels)
labelsOnehot = keras.utils.to_categorical(labels,num_classes=3)
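# LabelEncoder sorts classes alphabetically, so encoder.classes_ is
# ['bad', 'great', 'okay'] and column 1 of the one-hot matrix means 'great'.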
# Split the data 70/30; train_test_split shuffles by default, which matters here
# because the file is organized by label
trainFeatures, testFeatures, trainLabels, testLabels =\
train_test_split(features, labelsOnehot, train_size=0.7, random_state=42)
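# The quality scores skew heavily toward the middle bin, so 'bad' and 'great' are
# rare; passing stratify=labels to train_test_split would keep the class ratios
# consistent across both splits.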
# Build the neural network. The input layer has one node per feature (11 for this
# dataset). Use two hidden layers with the relu activation function, each followed
# by a dropout layer, and include L2 regularizers to mitigate overfitting, which
# neural networks are prone to.
neural_network = keras.Sequential([
# Input layer
keras.Input(shape=(features.shape[1],)),
# Hidden layers
keras.layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
keras.layers.Dropout(0.2),
keras.layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
keras.layers.Dropout(0.2),
# Output Layer
keras.layers.Dense(3, activation='softmax')
])
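# Print a summary of layer output shapes and parameter counts as a sanity check
neural_network.summary()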
# Use the adam optimizer with categorical cross-entropy loss and accuracy as the metric
neural_network.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Final hyperparameters: 100 epochs, batch size 32, and a 20% validation split, otherwise default values
history = neural_network.fit(trainFeatures, trainLabels, epochs=100, batch_size=32, validation_split=0.2, verbose=1)
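# Optional: an EarlyStopping callback can stop training once val_loss stops
# improving and restore the best weights, e.g.
# stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# and then pass callbacks=[stop] to fit() above.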
loss, modelAccuracy = neural_network.evaluate(testFeatures, testLabels)
print(f"\nTest accuracy: {modelAccuracy:.4f}")
# Make predictions on the test set, then convert the predicted probabilities and
# the one-hot test labels back into integer class indices for the report.
predictions = neural_network.predict(testFeatures)
predictedLabels = predictions.argmax(axis=1)
trueLabels = testLabels.argmax(axis=1)
report = classification_report(trueLabels, predictedLabels, labels=[0, 1, 2], target_names=encoder.classes_)
print(report)
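# A confusion matrix shows where the minority classes ('bad' and 'great') end up,
# which the aggregate accuracy number hides.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(trueLabels, predictedLabels, labels=[0, 1, 2]))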
# Plot training & validation accuracy and loss
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'])
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'])
plt.tight_layout()
plt.show()