package com.thealgorithms.searches; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; /** * Inverted Index implementation with BM25 Scoring for movie search. * This class supports adding movie documents and searching for terms * within those documents using the BM25 algorithm. * @author Prayas Kumar (https://github.com/prayas7102) */ class Movie { int docId; // Unique identifier for the movie String name; // Movie name double imdbRating; // IMDb rating of the movie int releaseYear; // Year the movie was released String content; // Full text content (could be the description or script) /** * Constructor for the Movie class. * @param docId Unique identifier for the movie. * @param name Name of the movie. * @param imdbRating IMDb rating of the movie. * @param releaseYear Release year of the movie. * @param content Content or description of the movie. */ Movie(int docId, String name, double imdbRating, int releaseYear, String content) { this.docId = docId; this.name = name; this.imdbRating = imdbRating; this.releaseYear = releaseYear; this.content = content; } /** * Get all the words from the movie's name and content. * Converts the name and content to lowercase and splits on non-word characters. * @return Array of words from the movie name and content. */ public String[] getWords() { return (name + " " + content).toLowerCase().split("\\W+"); } @Override public String toString() { return "Movie{" + "docId=" + docId + ", name='" + name + '\'' + ", imdbRating=" + imdbRating + ", releaseYear=" + releaseYear + '}'; } } class SearchResult { int docId; // Unique identifier of the movie document double relevanceScore; // Relevance score based on the BM25 algorithm /** * Constructor for SearchResult class. * @param docId Document ID (movie) for this search result. * @param relevanceScore The relevance score based on BM25 scoring. */ SearchResult(int docId, double relevanceScore) { this.docId = docId; this.relevanceScore = relevanceScore; } public int getDocId() { return docId; } @Override public String toString() { return "SearchResult{" + "docId=" + docId + ", relevanceScore=" + relevanceScore + '}'; } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } SearchResult that = (SearchResult) o; return docId == that.docId && Double.compare(that.relevanceScore, relevanceScore) == 0; } @Override public int hashCode() { return Objects.hash(docId, relevanceScore); } public double getRelevanceScore() { return this.relevanceScore; } } public final class BM25InvertedIndex { private Map> index; // Inverted index mapping terms to document id and frequency private Map movies; // Mapping of movie document IDs to Movie objects private int totalDocuments; // Total number of movies/documents private double avgDocumentLength; // Average length of documents (number of words) private static final double K = 1.5; // BM25 tuning parameter, controls term frequency saturation private static final double B = 0.75; // BM25 tuning parameter, controls length normalization /** * Constructor for BM25InvertedIndex. * Initializes the inverted index and movie storage. */ BM25InvertedIndex() { index = new HashMap<>(); movies = new HashMap<>(); totalDocuments = 0; avgDocumentLength = 0.0; } /** * Add a movie to the index. * @param docId Unique identifier for the movie. * @param name Name of the movie. * @param imdbRating IMDb rating of the movie. * @param releaseYear Release year of the movie. * @param content Content or description of the movie. */ public void addMovie(int docId, String name, double imdbRating, int releaseYear, String content) { Movie movie = new Movie(docId, name, imdbRating, releaseYear, content); movies.put(docId, movie); totalDocuments++; // Get words (terms) from the movie's name and content String[] terms = movie.getWords(); int docLength = terms.length; // Update the average document length avgDocumentLength = (avgDocumentLength * (totalDocuments - 1) + docLength) / totalDocuments; // Update the inverted index for (String term : terms) { // Create a new entry if the term is not yet in the index index.putIfAbsent(term, new HashMap<>()); // Get the list of documents containing the term Map docList = index.get(term); if (docList == null) { docList = new HashMap<>(); index.put(term, docList); // Ensure docList is added to the index } // Increment the term frequency in this document docList.put(docId, docList.getOrDefault(docId, 0) + 1); } } public int getMoviesLength() { return movies.size(); } /** * Search for documents containing a term using BM25 scoring. * @param term The search term. * @return A list of search results sorted by relevance score. */ public List search(String term) { term = term.toLowerCase(); // Normalize search term if (!index.containsKey(term)) { return new ArrayList<>(); // Return empty list if term not found } Map termDocs = index.get(term); // Documents containing the term List results = new ArrayList<>(); // Compute IDF for the search term double idf = computeIDF(termDocs.size()); // Calculate relevance scores for all documents containing the term for (Map.Entry entry : termDocs.entrySet()) { int docId = entry.getKey(); int termFrequency = entry.getValue(); Movie movie = movies.get(docId); if (movie == null) { continue; // Skip this document if movie doesn't exist } double docLength = movie.getWords().length; // Compute BM25 relevance score double score = computeBM25Score(termFrequency, docLength, idf); results.add(new SearchResult(docId, score)); } // Sort the results by relevance score in descending order results.sort((r1, r2) -> Double.compare(r2.relevanceScore, r1.relevanceScore)); return results; } /** * Compute the BM25 score for a given term and document. * @param termFrequency The frequency of the term in the document. * @param docLength The length of the document. * @param idf The inverse document frequency of the term. * @return The BM25 relevance score for the term in the document. */ private double computeBM25Score(int termFrequency, double docLength, double idf) { double numerator = termFrequency * (K + 1); double denominator = termFrequency + K * (1 - B + B * (docLength / avgDocumentLength)); return idf * (numerator / denominator); } /** * Compute the inverse document frequency (IDF) of a term. * The IDF measures the importance of a term across the entire document set. * @param docFrequency The number of documents that contain the term. * @return The inverse document frequency (IDF) value. */ private double computeIDF(int docFrequency) { // Total number of documents in the index return Math.log((totalDocuments - docFrequency + 0.5) / (docFrequency + 0.5) + 1); } }