Rabin–Karp Algorithm

1. Algorithm Explanation

The Rabin–Karp algorithm is a string‐searching (or substring‐searching) algorithm that uses hashing to find any one of a set of pattern strings in a text. It compares hash values of the pattern and substrings of the text, and only when the hash values match does it fall back to a direct character‐by‐character check.

  1. Precompute the hash hp of the pattern of length m.
  2. Compute the hash ht of the first window (first m characters) of the text of length n.
  3. Slide the window over the text one character at a time, updating the rolling hash in O(1):
  4. 
    ht = (base · (prev_hash − text[i-1]·basem−1) + text[i+m−1]) mod M
          
  5. Whenever ht == hp, perform a direct comparison to verify (to avoid false positives due to hash collisions).

Time Complexity

  • Average case: O(n + m). Each window hash update is O(1), and mismatched hashes skip the full compare.
  • Worst case: O(n·m) if every hash collision forces a full compare.

Memory Complexity

  • O(1) extra space (ignoring input and output): only a few integer variables for the rolling hash.

2. Easy Problems

2.1. Check if a Pattern Exists

Given text s and pattern p, return true if p is a substring of s.


#include <iostream>
#include <string>
using namespace std;

bool rabinKarp(const string &s, const string &p) {
    int n = s.size(), m = p.size();
    if (m > n) return false;
    const long long base = 256, mod = 1e9 + 7;
    long long hp = 0, ht = 0, power = 1;
    // precompute highest base^(m-1)
    for (int i = 0; i < m - 1; ++i) power = (power * base) % mod;
    // initial hash values
    for (int i = 0; i < m; ++i) {
        hp = (hp * base + p[i]) % mod;
        ht = (ht * base + s[i]) % mod;
    }
    // slide over text
    for (int i = 0; i + m <= n; ++i) {
        if (hp == ht && s.substr(i, m) == p) return true;
        if (i + m < n) {
            ht = (ht - s[i] * power % mod + mod) % mod;
            ht = (ht * base + s[i + m]) % mod;
        }
    }
    return false;
}

int main() {
    cout << boolalpha
         << rabinKarp("hello world", "world")  // true
         << endl;
    return 0;
}
    

2.2. Find All Occurrences of a Pattern

Return all start indices where p occurs in s.


#include <iostream>
#include <vector>
#include <string>
using namespace std;

vector<int> findAll(const string &s, const string &p) {
    int n = s.size(), m = p.size();
    vector<int> res;
    if (m > n) return res;
    const long long base = 256, mod = 1e9 + 7;
    long long hp = 0, ht = 0, power = 1;
    for (int i = 0; i < m - 1; ++i) power = (power * base) % mod;
    for (int i = 0; i < m; ++i) {
        hp = (hp * base + p[i]) % mod;
        ht = (ht * base + s[i]) % mod;
    }
    for (int i = 0; i + m <= n; ++i) {
        if (hp == ht && s.substr(i, m) == p) res.push_back(i);
        if (i + m < n) {
            ht = (ht - s[i] * power % mod + mod) % mod;
            ht = (ht * base + s[i + m]) % mod;
        }
    }
    return res;
}

int main() {
    auto idxs = findAll("abracadabra", "abra");
    for (int i: idxs) cout << i << ' ';  // 0 7
    cout << endl;
    return 0;
}
    

3. Intermediate Problems

3.1. 2D Pattern Matching

Given an R×C grid of characters and an r×c pattern, find all top-left corners where the pattern appears. We treat each row as a string, hash each row window, then hash the column of row-hashes.


#include <bits/stdc++.h>
using namespace std;
using ull = unsigned long long;

// compute rolling hash for each row segment of width c
vector<vector<ull>> rowHashes(const vector<string>& G, int r, int c, ull base, ull mod) {
    int R = G.size(), C = G[0].size();
    ull power = 1;
    for (int i = 0; i < c - 1; ++i) power = (power * base) % mod;
    vector<vector<ull>> H(R, vector<ull>(C - c + 1));
    for (int i = 0; i < R; ++i) {
        ull h = 0;
        for (int j = 0; j < c; ++j) h = (h * base + G[i][j]) % mod;
        H[i][0] = h;
        for (int j = 1; j + c <= C; ++j) {
            h = (h + mod - G[i][j-1] * power % mod) % mod;
            h = (h * base + G[i][j + c - 1]) % mod;
            H[i][j] = h;
        }
    }
    return H;
}

vector<pair<int,int>> find2D(const vector<string>& G, const vector<string>& P) {
    int R = G.size(), C = G[0].size();
    int r = P.size(), c = P[0].size();
    const ull base = 131, mod = 1000000007;
    auto Hg = rowHashes(G, r, c, base, mod);
    auto Hp = rowHashes(P, r, c, base, mod);
    // hash each column of row-hashes
    ull power = 1;
    for (int i = 0; i < r - 1; ++i) power = (power * base) % mod;
    vector<pair<int,int>> res;
    for (int j = 0; j + c <= C; ++j) {
        // initial column hash for window rows [0..r-1]
        ull col_hg = 0, col_hp = 0;
        for (int i = 0; i < r; ++i) {
            col_hg = (col_hg * base + Hg[i][j]) % mod;
            col_hp = (col_hp * base + Hp[i][0]) % mod;
        }
        if (col_hg == col_hp) res.emplace_back(0, j);
        for (int i = 1; i + r <= R; ++i) {
            col_hg = (col_hg + mod - Hg[i-1][j] * power % mod) % mod;
            col_hg = (col_hg * base + Hg[i+r-1][j]) % mod;
            col_hp = (col_hp + mod - Hp[i-1][0] * power % mod) % mod;
            col_hp = (col_hp * base + Hp[i+r-1][0]) % mod;
            if (col_hg == col_hp) res.emplace_back(i, j);
        }
    }
    return res;
}

int main() {
    vector<string> G = {
      "1234123412",
      "5612561256",
      "1234123412",
      "5612561256"
    };
    vector<string> P = {
      "1234",
      "5612"
    };
    for (auto &pr: find2D(G, P))
      cout << "(" << pr.first << "," << pr.second << ") ";
    // outputs (0,0) (0,4) (2,0) (2,4)
    return 0;
}
    

3.2. Detect Duplicate Substrings of Length K

Given string s and integer K, determine if any substring of length K appears at least twice. Use a hash‐set of rolling hashes.


#include <iostream>
#include <string>
#include <unordered_set>
using namespace std;

bool hasDuplicateLengthK(const string &s, int K) {
    int n = s.size();
    if (K > n) return false;
    const long long base = 256, mod = 1e9+7;
    long long h = 0, power = 1;
    for (int i = 0; i < K-1; ++i) power = (power * base) % mod;
    unordered_set<long long> seen;
    for (int i = 0; i < n; ++i) {
        h = (h * base + s[i]) % mod;
        if (i >= K) {
            h = (h + mod - s[i-K] * power % mod) % mod;
        }
        if (i >= K-1) {
            if (seen.count(h)) return true;
            seen.insert(h);
        }
    }
    return false;
}

int main() {
    cout << boolalpha
         << hasDuplicateLengthK("banana", 3)  // true ("ana")
         << endl;
    return 0;
}
    

4. Hard Problem

4.1. Longest Common Substring of Two Strings

Given two strings A and B, find the length (and optionally the substring) of their longest common substring. Use binary search on length L, and for each L check via rolling‐hash set.


#include <bits/stdc++.h>
using namespace std;
using ull = unsigned long long;

int longestCommonSubstring(const string &A, const string &B) {
    int n = A.size(), m = B.size();
    const ull base = 131, mod = 1000000007;

    auto check = [&](int L) {
        if (L == 0) return -1;
        ull power = 1;
        for (int i = 0; i < L-1; ++i) power = (power * base) % mod;
        unordered_set<ull> hashes;
        ull h = 0;
        // hashes of A substrings of length L
        for (int i = 0; i < n; ++i) {
            h = (h * base + A[i]) % mod;
            if (i >= L) h = (h + mod - A[i-L] * power % mod) % mod;
            if (i >= L-1) hashes.insert(h);
        }
        // check in B
        h = 0;
        for (int i = 0; i < m; ++i) {
            h = (h * base + B[i]) % mod;
            if (i >= L) h = (h + mod - B[i-L] * power % mod) % mod;
            if (i >= L-1 && hashes.count(h)) return i - L + 1;
        }
        return -1;
    };

    int lo = 0, hi = min(n, m), best = 0;
    while (lo <= hi) {
        int mid = (lo + hi) / 2;
        if (check(mid) != -1) {
            best = mid;
            lo = mid + 1;
        } else {
            hi = mid - 1;
        }
    }
    return best;
}

int main() {
    cout <<
      longestCommonSubstring("ABABC", "BABCA")  // 4 ("BABC" or "ABCA")
      << endl;
    return 0;
}