Rabin–Karp Algorithm
1. Algorithm Explanation
The Rabin–Karp algorithm is a string‐searching (or substring‐searching) algorithm that uses hashing to find any one of a set of pattern strings in a text. It compares hash values of the pattern and substrings of the text, and only when the hash values match does it fall back to a direct character‐by‐character check.
- Precompute the hash hp of the pattern of length m.
- Compute the hash ht of the first window (first m characters) of the text of length n.
- Slide the window over the text one character at a time, updating the rolling hash in O(1):
- Whenever ht == hp, perform a direct comparison to verify (to avoid false positives due to hash collisions).
ht = (base · (prev_hash − text[i-1]·basem−1) + text[i+m−1]) mod M
Time Complexity
- Average case: O(n + m). Each window hash update is O(1), and mismatched hashes skip the full compare.
- Worst case: O(n·m) if every hash collision forces a full compare.
Memory Complexity
- O(1) extra space (ignoring input and output): only a few integer variables for the rolling hash.
2. Easy Problems
2.1. Check if a Pattern Exists
Given text s
and pattern p
, return true
if p
is a substring of s
.
#include <iostream>
#include <string>
using namespace std;
bool rabinKarp(const string &s, const string &p) {
int n = s.size(), m = p.size();
if (m > n) return false;
const long long base = 256, mod = 1e9 + 7;
long long hp = 0, ht = 0, power = 1;
// precompute highest base^(m-1)
for (int i = 0; i < m - 1; ++i) power = (power * base) % mod;
// initial hash values
for (int i = 0; i < m; ++i) {
hp = (hp * base + p[i]) % mod;
ht = (ht * base + s[i]) % mod;
}
// slide over text
for (int i = 0; i + m <= n; ++i) {
if (hp == ht && s.substr(i, m) == p) return true;
if (i + m < n) {
ht = (ht - s[i] * power % mod + mod) % mod;
ht = (ht * base + s[i + m]) % mod;
}
}
return false;
}
int main() {
cout << boolalpha
<< rabinKarp("hello world", "world") // true
<< endl;
return 0;
}
2.2. Find All Occurrences of a Pattern
Return all start indices where p
occurs in s
.
#include <iostream>
#include <vector>
#include <string>
using namespace std;
vector<int> findAll(const string &s, const string &p) {
int n = s.size(), m = p.size();
vector<int> res;
if (m > n) return res;
const long long base = 256, mod = 1e9 + 7;
long long hp = 0, ht = 0, power = 1;
for (int i = 0; i < m - 1; ++i) power = (power * base) % mod;
for (int i = 0; i < m; ++i) {
hp = (hp * base + p[i]) % mod;
ht = (ht * base + s[i]) % mod;
}
for (int i = 0; i + m <= n; ++i) {
if (hp == ht && s.substr(i, m) == p) res.push_back(i);
if (i + m < n) {
ht = (ht - s[i] * power % mod + mod) % mod;
ht = (ht * base + s[i + m]) % mod;
}
}
return res;
}
int main() {
auto idxs = findAll("abracadabra", "abra");
for (int i: idxs) cout << i << ' '; // 0 7
cout << endl;
return 0;
}
3. Intermediate Problems
3.1. 2D Pattern Matching
Given an R×C
grid of characters and an r×c
pattern, find all top-left corners where the pattern appears.
We treat each row as a string, hash each row window, then hash the column of row-hashes.
#include <bits/stdc++.h>
using namespace std;
using ull = unsigned long long;
// compute rolling hash for each row segment of width c
vector<vector<ull>> rowHashes(const vector<string>& G, int r, int c, ull base, ull mod) {
int R = G.size(), C = G[0].size();
ull power = 1;
for (int i = 0; i < c - 1; ++i) power = (power * base) % mod;
vector<vector<ull>> H(R, vector<ull>(C - c + 1));
for (int i = 0; i < R; ++i) {
ull h = 0;
for (int j = 0; j < c; ++j) h = (h * base + G[i][j]) % mod;
H[i][0] = h;
for (int j = 1; j + c <= C; ++j) {
h = (h + mod - G[i][j-1] * power % mod) % mod;
h = (h * base + G[i][j + c - 1]) % mod;
H[i][j] = h;
}
}
return H;
}
vector<pair<int,int>> find2D(const vector<string>& G, const vector<string>& P) {
int R = G.size(), C = G[0].size();
int r = P.size(), c = P[0].size();
const ull base = 131, mod = 1000000007;
auto Hg = rowHashes(G, r, c, base, mod);
auto Hp = rowHashes(P, r, c, base, mod);
// hash each column of row-hashes
ull power = 1;
for (int i = 0; i < r - 1; ++i) power = (power * base) % mod;
vector<pair<int,int>> res;
for (int j = 0; j + c <= C; ++j) {
// initial column hash for window rows [0..r-1]
ull col_hg = 0, col_hp = 0;
for (int i = 0; i < r; ++i) {
col_hg = (col_hg * base + Hg[i][j]) % mod;
col_hp = (col_hp * base + Hp[i][0]) % mod;
}
if (col_hg == col_hp) res.emplace_back(0, j);
for (int i = 1; i + r <= R; ++i) {
col_hg = (col_hg + mod - Hg[i-1][j] * power % mod) % mod;
col_hg = (col_hg * base + Hg[i+r-1][j]) % mod;
col_hp = (col_hp + mod - Hp[i-1][0] * power % mod) % mod;
col_hp = (col_hp * base + Hp[i+r-1][0]) % mod;
if (col_hg == col_hp) res.emplace_back(i, j);
}
}
return res;
}
int main() {
vector<string> G = {
"1234123412",
"5612561256",
"1234123412",
"5612561256"
};
vector<string> P = {
"1234",
"5612"
};
for (auto &pr: find2D(G, P))
cout << "(" << pr.first << "," << pr.second << ") ";
// outputs (0,0) (0,4) (2,0) (2,4)
return 0;
}
3.2. Detect Duplicate Substrings of Length K
Given string s
and integer K
, determine if any substring of length K
appears at least twice.
Use a hash‐set of rolling hashes.
#include <iostream>
#include <string>
#include <unordered_set>
using namespace std;
bool hasDuplicateLengthK(const string &s, int K) {
int n = s.size();
if (K > n) return false;
const long long base = 256, mod = 1e9+7;
long long h = 0, power = 1;
for (int i = 0; i < K-1; ++i) power = (power * base) % mod;
unordered_set<long long> seen;
for (int i = 0; i < n; ++i) {
h = (h * base + s[i]) % mod;
if (i >= K) {
h = (h + mod - s[i-K] * power % mod) % mod;
}
if (i >= K-1) {
if (seen.count(h)) return true;
seen.insert(h);
}
}
return false;
}
int main() {
cout << boolalpha
<< hasDuplicateLengthK("banana", 3) // true ("ana")
<< endl;
return 0;
}
4. Hard Problem
4.1. Longest Common Substring of Two Strings
Given two strings A
and B
, find the length (and optionally the substring) of their longest common substring.
Use binary search on length L
, and for each L
check via rolling‐hash set.
#include <bits/stdc++.h>
using namespace std;
using ull = unsigned long long;
int longestCommonSubstring(const string &A, const string &B) {
int n = A.size(), m = B.size();
const ull base = 131, mod = 1000000007;
auto check = [&](int L) {
if (L == 0) return -1;
ull power = 1;
for (int i = 0; i < L-1; ++i) power = (power * base) % mod;
unordered_set<ull> hashes;
ull h = 0;
// hashes of A substrings of length L
for (int i = 0; i < n; ++i) {
h = (h * base + A[i]) % mod;
if (i >= L) h = (h + mod - A[i-L] * power % mod) % mod;
if (i >= L-1) hashes.insert(h);
}
// check in B
h = 0;
for (int i = 0; i < m; ++i) {
h = (h * base + B[i]) % mod;
if (i >= L) h = (h + mod - B[i-L] * power % mod) % mod;
if (i >= L-1 && hashes.count(h)) return i - L + 1;
}
return -1;
};
int lo = 0, hi = min(n, m), best = 0;
while (lo <= hi) {
int mid = (lo + hi) / 2;
if (check(mid) != -1) {
best = mid;
lo = mid + 1;
} else {
hi = mid - 1;
}
}
return best;
}
int main() {
cout <<
longestCommonSubstring("ABABC", "BABCA") // 4 ("BABC" or "ABCA")
<< endl;
return 0;
}