Welcome to Subscribe On Youtube

187. Repeated DNA Sequences

Description

The DNA sequence is composed of a series of nucleotides abbreviated as 'A', 'C', 'G', and 'T'.

  • For example, "ACGAATTCCG" is a DNA sequence.

When studying DNA, it is useful to identify repeated sequences within the DNA.

Given a string s that represents a DNA sequence, return all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule. You may return the answer in any order.

 

Example 1:

Input: s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT"
Output: ["AAAAACCCCC","CCCCCAAAAA"]

Example 2:

Input: s = "AAAAAAAAAAAAA"
Output: ["AAAAAAAAAA"]

 

Constraints:

  • 1 <= s.length <= 105
  • s[i] is either 'A', 'C', 'G', or 'T'.

Solutions

Solution 1: Hash Table

We define a hash table $cnt$ to store the occurrence count of all substrings of length $10$.

We iterate through all substrings of length $10$ in the string $s$. For the current substring $t$, we update its count in the hash table. If the count of $t$ is $2$, we add it to the answer.

After the iteration, we return the answer array.

The time complexity is $O(n \times 10)$, and the space complexity is $O(n \times 10)$. Here, $n$ is the length of the string $s$.

A => binary: 00
C => binary: 01
G => binary: 10
T => binary: 11

Basically, this questions is more of a compression problem, how to encode the 10-char string to use less disk space.

  • plain char is 8 bits (2 bytes)
  • encoding to 2 bits (00,01,10,11)

Solution 2: Rabin-Karp String Matching Algorithm

This method essentially combines sliding window and hash. Similar to 28 Find-the-Index-of-the-First-Occurrence-in-a-String. Find the Index of the First Occurrence in a String, this problem can use a hash function to reduce the time complexity of counting subsequences to $O(1)$.

The time complexity is $O(n)$, and the space complexity is $O(n)$. Here, $n$ is the length of the string $s$.

  • class Solution {
        public List<String> findRepeatedDnaSequences(String s) {
            Map<String, Integer> cnt = new HashMap<>();
            List<String> ans = new ArrayList<>();
            for (int i = 0; i < s.length() - 10 + 1; ++i) {
                String t = s.substring(i, i + 10);
                if (cnt.merge(t, 1, Integer::sum) == 2) {
                    ans.add(t);
                }
            }
            return ans;
        }
    }
    
  • class Solution {
    public:
        vector<string> findRepeatedDnaSequences(string s) {
            unordered_map<string, int> cnt;
            vector<string> ans;
            for (int i = 0, n = s.size() - 10 + 1; i < n; ++i) {
                auto t = s.substr(i, 10);
                if (++cnt[t] == 2) {
                    ans.emplace_back(t);
                }
            }
            return ans;
        }
    };
    
  • class Solution:
        def findRepeatedDnaSequences(self, s: str) -> List[str]:
            words = set() # for encoded words
            repeated = set() # for encoded words
            result = [] # not encoded words
    
            # A => binary: 00
            # C => binary: 01
            # G => binary: 10
            # T => binary: 11
            mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    
            for i in range(len(s) - 9):
                v = 0
                for j in range(i, i + 10):
                    # every time, use the new 2 bits after shifting for current char
                    v <<= 2
                    v |= mapping[s[j]]
                if v in words and v not in repeated:
                    repeated.add(v)
                    result.append(s[i:i + 10])
    
                words.add(v)
    
            return result
    
    ############
    
    class Solution:
        def findRepeatedDnaSequences(self, s: str) -> List[str]:
            n = len(s) - 10
            cnt = Counter()
            ans = []
            for i in range(n + 1):
                sub = s[i : i + 10]
                cnt[sub] += 1
                if cnt[sub] == 2:
                    ans.append(sub)
            return ans
    
    
  • func findRepeatedDnaSequences(s string) (ans []string) {
    	cnt := map[string]int{}
    	for i := 0; i < len(s)-10+1; i++ {
    		t := s[i : i+10]
    		cnt[t]++
    		if cnt[t] == 2 {
    			ans = append(ans, t)
    		}
    	}
    	return
    }
    
  • function findRepeatedDnaSequences(s: string): string[] {
        const n = s.length;
        const cnt: Map<string, number> = new Map();
        const ans: string[] = [];
        for (let i = 0; i <= n - 10; ++i) {
            const t = s.slice(i, i + 10);
            cnt.set(t, (cnt.get(t) ?? 0) + 1);
            if (cnt.get(t) === 2) {
                ans.push(t);
            }
        }
        return ans;
    }
    
    
  • /**
     * @param {string} s
     * @return {string[]}
     */
    var findRepeatedDnaSequences = function (s) {
        const cnt = new Map();
        const ans = [];
        for (let i = 0; i < s.length - 10 + 1; ++i) {
            const t = s.slice(i, i + 10);
            cnt.set(t, (cnt.get(t) || 0) + 1);
            if (cnt.get(t) === 2) {
                ans.push(t);
            }
        }
        return ans;
    };
    
    
  • public class Solution {
        public IList<string> FindRepeatedDnaSequences(string s) {
            var cnt = new Dictionary<string, int>();
            var ans = new List<string>();
            for (int i = 0; i < s.Length - 10 + 1; ++i) {
                var t = s.Substring(i, 10);
                if (!cnt.ContainsKey(t)) {
                    cnt[t] = 0;
                }
                if (++cnt[t] == 2) {
                    ans.Add(t);
                }
            }
            return ans;
        }
    }
    
  • use std::collections::HashMap;
    
    impl Solution {
        pub fn find_repeated_dna_sequences(s: String) -> Vec<String> {
            if s.len() < 10 {
                return vec![];
            }
            let mut cnt = HashMap::new();
            let mut ans = Vec::new();
            for i in 0..s.len() - 9 {
                let t = &s[i..i + 10];
                let count = cnt.entry(t).or_insert(0);
                *count += 1;
                if *count == 2 {
                    ans.push(t.to_string());
                }
            }
            ans
        }
    }
    
    

All Problems

All Solutions