diff --git a/src/main/java/com/thealgorithms/searches/BM25InvertedIndex.java b/src/main/java/com/thealgorithms/searches/BM25InvertedIndex.java new file mode 100644 index 000000000..1cfd2bbad --- /dev/null +++ b/src/main/java/com/thealgorithms/searches/BM25InvertedIndex.java @@ -0,0 +1,220 @@ +package com.thealgorithms.searches; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +/** + * Inverted Index implementation with BM25 Scoring for movie search. + * This class supports adding movie documents and searching for terms + * within those documents using the BM25 algorithm. + * @author Prayas Kumar (https://github.com/prayas7102) + */ + +class Movie { + int docId; // Unique identifier for the movie + String name; // Movie name + double imdbRating; // IMDb rating of the movie + int releaseYear; // Year the movie was released + String content; // Full text content (could be the description or script) + + /** + * Constructor for the Movie class. + * @param docId Unique identifier for the movie. + * @param name Name of the movie. + * @param imdbRating IMDb rating of the movie. + * @param releaseYear Release year of the movie. + * @param content Content or description of the movie. + */ + Movie(int docId, String name, double imdbRating, int releaseYear, String content) { + this.docId = docId; + this.name = name; + this.imdbRating = imdbRating; + this.releaseYear = releaseYear; + this.content = content; + } + + /** + * Get all the words from the movie's name and content. + * Converts the name and content to lowercase and splits on non-word characters. + * @return Array of words from the movie name and content. + */ + public String[] getWords() { + return (name + " " + content).toLowerCase().split("\\W+"); + } + + @Override + public String toString() { + return "Movie{" + + "docId=" + docId + ", name='" + name + '\'' + ", imdbRating=" + imdbRating + ", releaseYear=" + releaseYear + '}'; + } +} + +class SearchResult { + int docId; // Unique identifier of the movie document + double relevanceScore; // Relevance score based on the BM25 algorithm + + /** + * Constructor for SearchResult class. + * @param docId Document ID (movie) for this search result. + * @param relevanceScore The relevance score based on BM25 scoring. + */ + SearchResult(int docId, double relevanceScore) { + this.docId = docId; + this.relevanceScore = relevanceScore; + } + + public int getDocId() { + return docId; + } + + @Override + public String toString() { + return "SearchResult{" + + "docId=" + docId + ", relevanceScore=" + relevanceScore + '}'; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + SearchResult that = (SearchResult) o; + return docId == that.docId && Double.compare(that.relevanceScore, relevanceScore) == 0; + } + + @Override + public int hashCode() { + return Objects.hash(docId, relevanceScore); + } + + public double getRelevanceScore() { + return this.relevanceScore; + } +} + +public final class BM25InvertedIndex { + private Map> index; // Inverted index mapping terms to document id and frequency + private Map movies; // Mapping of movie document IDs to Movie objects + private int totalDocuments; // Total number of movies/documents + private double avgDocumentLength; // Average length of documents (number of words) + private static final double K = 1.5; // BM25 tuning parameter, controls term frequency saturation + private static final double B = 0.75; // BM25 tuning parameter, controls length normalization + + /** + * Constructor for BM25InvertedIndex. + * Initializes the inverted index and movie storage. + */ + BM25InvertedIndex() { + index = new HashMap<>(); + movies = new HashMap<>(); + totalDocuments = 0; + avgDocumentLength = 0.0; + } + + /** + * Add a movie to the index. + * @param docId Unique identifier for the movie. + * @param name Name of the movie. + * @param imdbRating IMDb rating of the movie. + * @param releaseYear Release year of the movie. + * @param content Content or description of the movie. + */ + public void addMovie(int docId, String name, double imdbRating, int releaseYear, String content) { + Movie movie = new Movie(docId, name, imdbRating, releaseYear, content); + movies.put(docId, movie); + totalDocuments++; + + // Get words (terms) from the movie's name and content + String[] terms = movie.getWords(); + int docLength = terms.length; + + // Update the average document length + avgDocumentLength = (avgDocumentLength * (totalDocuments - 1) + docLength) / totalDocuments; + + // Update the inverted index + for (String term : terms) { + // Create a new entry if the term is not yet in the index + index.putIfAbsent(term, new HashMap<>()); + + // Get the list of documents containing the term + Map docList = index.get(term); + if (docList == null) { + docList = new HashMap<>(); + index.put(term, docList); // Ensure docList is added to the index + } + // Increment the term frequency in this document + docList.put(docId, docList.getOrDefault(docId, 0) + 1); + } + } + + public int getMoviesLength() { + return movies.size(); + } + + /** + * Search for documents containing a term using BM25 scoring. + * @param term The search term. + * @return A list of search results sorted by relevance score. + */ + public List search(String term) { + term = term.toLowerCase(); // Normalize search term + if (!index.containsKey(term)) { + return new ArrayList<>(); // Return empty list if term not found + } + + Map termDocs = index.get(term); // Documents containing the term + List results = new ArrayList<>(); + + // Compute IDF for the search term + double idf = computeIDF(termDocs.size()); + + // Calculate relevance scores for all documents containing the term + for (Map.Entry entry : termDocs.entrySet()) { + int docId = entry.getKey(); + int termFrequency = entry.getValue(); + Movie movie = movies.get(docId); + if (movie == null) { + continue; // Skip this document if movie doesn't exist + } + double docLength = movie.getWords().length; + + // Compute BM25 relevance score + double score = computeBM25Score(termFrequency, docLength, idf); + results.add(new SearchResult(docId, score)); + } + + // Sort the results by relevance score in descending order + results.sort((r1, r2) -> Double.compare(r2.relevanceScore, r1.relevanceScore)); + return results; + } + + /** + * Compute the BM25 score for a given term and document. + * @param termFrequency The frequency of the term in the document. + * @param docLength The length of the document. + * @param idf The inverse document frequency of the term. + * @return The BM25 relevance score for the term in the document. + */ + private double computeBM25Score(int termFrequency, double docLength, double idf) { + double numerator = termFrequency * (K + 1); + double denominator = termFrequency + K * (1 - B + B * (docLength / avgDocumentLength)); + return idf * (numerator / denominator); + } + + /** + * Compute the inverse document frequency (IDF) of a term. + * The IDF measures the importance of a term across the entire document set. + * @param docFrequency The number of documents that contain the term. + * @return The inverse document frequency (IDF) value. + */ + private double computeIDF(int docFrequency) { + // Total number of documents in the index + return Math.log((totalDocuments - docFrequency + 0.5) / (docFrequency + 0.5)); + } +} diff --git a/src/test/java/com/thealgorithms/searches/BM25InvertedIndexTest.java b/src/test/java/com/thealgorithms/searches/BM25InvertedIndexTest.java new file mode 100644 index 000000000..8595e0a00 --- /dev/null +++ b/src/test/java/com/thealgorithms/searches/BM25InvertedIndexTest.java @@ -0,0 +1,93 @@ +package com.thealgorithms.searches; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.List; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +/** + * Test Cases for Inverted Index with BM25 + * @author Prayas Kumar (https://github.com/prayas7102) + */ + +class BM25InvertedIndexTest { + + private static BM25InvertedIndex index; + + @BeforeAll + static void setUp() { + index = new BM25InvertedIndex(); + index.addMovie(1, "The Shawshank Redemption", 9.3, 1994, "Hope is a good thing. Maybe the best of things. And no good thing ever dies."); + index.addMovie(2, "The Godfather", 9.2, 1972, "I'm gonna make him an offer he can't refuse."); + index.addMovie(3, "The Dark Knight", 9.0, 2008, "You either die a hero or live long enough to see yourself become the villain."); + index.addMovie(4, "Pulp Fiction", 8.9, 1994, "You know what they call a Quarter Pounder with Cheese in Paris? They call it a Royale with Cheese."); + index.addMovie(5, "Good Will Hunting", 8.3, 1997, "Will Hunting is a genius and he has a good heart. The best of his abilities is yet to be explored."); + index.addMovie(6, "It's a Wonderful Life", 8.6, 1946, "Each man's life touches so many other lives. If he wasn't around, it would leave an awfully good hole."); + index.addMovie(7, "The Pursuit of Happyness", 8.0, 2006, "It was the pursuit of a better life, and a good opportunity to change things for the better."); + index.addMovie(8, "A Few Good Men", 7.7, 1992, "You can't handle the truth! This movie has a lot of good moments and intense drama."); + } + + @Test + void testAddMovie() { + // Check that the index contains the correct number of movies + int moviesLength = index.getMoviesLength(); + assertEquals(8, moviesLength); + } + + @Test + void testSearchForTermFound() { + int expected = 1; + List result = index.search("hope"); + int actual = result.getFirst().getDocId(); + assertEquals(expected, actual); + } + + @Test + void testSearchRanking() { + // Perform search for the term "good" + List results = index.search("good"); + assertFalse(results.isEmpty()); + + // Validate the ranking based on the provided relevance scores + assertEquals(6, results.get(0).getDocId()); // It's a Wonderful Life should be ranked 1st + assertEquals(7, results.get(1).getDocId()); // The Pursuit of Happyness should be ranked 2nd + assertEquals(5, results.get(2).getDocId()); // Good Will Hunting should be ranked 3rd + assertEquals(8, results.get(3).getDocId()); // A Few Good Men should be ranked 4th + assertEquals(1, results.get(4).getDocId()); // The Shawshank Redemption should be ranked 5th + + // Ensure the relevance scores are in descending order + for (int i = 0; i < results.size() - 1; i++) { + assertTrue(results.get(i).getRelevanceScore() > results.get(i + 1).getRelevanceScore()); + } + } + + @Test + void testSearchForTermNotFound() { + List results = index.search("nonexistent"); + assertTrue(results.isEmpty()); + } + + @Test + void testSearchForCommonTerm() { + List results = index.search("the"); + assertFalse(results.isEmpty()); + assertTrue(results.size() > 1); + } + + @Test + void testBM25ScoreCalculation() { + List results = index.search("cheese"); + assertEquals(1, results.size()); + assertEquals(4, results.getFirst().docId); // Pulp Fiction should have the highest score + } + + @Test + void testCaseInsensitivity() { + List resultsLowerCase = index.search("hope"); + List resultsUpperCase = index.search("HOPE"); + assertEquals(resultsLowerCase, resultsUpperCase); + } +}