Welcome to Subscribe On Youtube

Question

Formatted question description: https://leetcode.ca/all/187.html

The DNA sequence is composed of a series of nucleotides abbreviated as 'A', 'C', 'G', and 'T'.

  • For example, "ACGAATTCCG" is a DNA sequence.

When studying DNA, it is useful to identify repeated sequences within the DNA.

Given a string s that represents a DNA sequence, return all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule. You may return the answer in any order.

 

Example 1:

Input: s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT"
Output: ["AAAAACCCCC","CCCCCAAAAA"]

Example 2:

Input: s = "AAAAAAAAAAAAA"
Output: ["AAAAAAAAAA"]

 

Constraints:

  • 1 <= s.length <= 105
  • s[i] is either 'A', 'C', 'G', or 'T'.

Algorithm

Using HashSet can have the characteristics of duplicate items.

A => binary: 00
C => binary: 01
G => binary: 10
T => binary: 11

Basically, this questions is more of a compression problem, how to encode the 10-char string to use less disk space.

  • plain char is 8 bits (2 bytes)
  • encoding to 2 bits (00,01,10,11)

Code

  • 
    public class Repeated_DNA_Sequences {
    
        public static void main(String[] args) {
            Repeated_DNA_Sequences out = new Repeated_DNA_Sequences();
            Solution s = out.new Solution();
    
            System.out.println(s.findRepeatedDnaSequences("AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT"));
        }
    
        class Solution_optimize {
            public List<String> findRepeatedDnaSequences(String s) {
                Set<Integer> words = new HashSet<>();
    
                // need this to remove duplicates
                // Input "AAAAAAAAAAAAA", result ["AAAAAAAAAA"]
                // to avoid result ["AAAAAAAAAA","AAAAAAAAAA","AAAAAAAAAA"]
                Set<Integer> repteated = new HashSet<>();
                List<String> result = new ArrayList<>();
    
                char[] map = new char[26];
                //map['A' - 'A'] = 0; // binary: 00
                map['C' - 'A'] = 1; // binary: 01
                map['G' - 'A'] = 2; // binary: 10
                map['T' - 'A'] = 3; // binary: 11
    
                for(int i = 0; i < s.length() - 9; i++) {
                    int v = 0;
                    for(int j = i; j < i + 10; j++) {
                        v <<= 2; // every time, use the new 2 bits after shifting for current char
                        v |= map[s.charAt(j) - 'A'];
                    }
                    if(words.contains(v) && !repteated.contains(v)) {
                        repteated.add(v);
                        result.add(s.substring(i, i + 10));
                    }
    
                    words.add(v); // re-add, if already contains
                }
                return result;
            }
        }
    
        class Solution {
            HashSet<String> existed = new HashSet<>();
            HashSet<String> result = new HashSet<>();
    
            public List<String> findRepeatedDnaSequences(String s) {
    
                if (s.length() <= 10) {
                    return new ArrayList<String>();
                }
    
                for (int i = 0; i < 10; i++) {
                    for (int j = i; j + 9 < s.length(); j += 10) {
    
                        String one = s.substring(j, j + 10);
    
                        if (existed.contains(one)) {
                            result.add(one);
                        }
    
                        existed.add(one);
                    }
                }
    
                return new ArrayList<String>(result);
            }
    
        }
    }
    
    ############
    
    class Solution {
        public List<String> findRepeatedDnaSequences(String s) {
            int n = s.length() - 10;
            Map<String, Integer> cnt = new HashMap<>();
            List<String> ans = new ArrayList<>();
            for (int i = 0; i <= n; ++i) {
                String sub = s.substring(i, i + 10);
                cnt.put(sub, cnt.getOrDefault(sub, 0) + 1);
                if (cnt.get(sub) == 2) {
                    ans.add(sub);
                }
            }
            return ans;
        }
    }
    
  • class Solution {
    public:
        vector<string> findRepeatedDnaSequences(string s) {
            map<string, int> cnt;
            int n = s.size() - 10;
            vector<string> ans;
            for (int i = 0; i <= n; ++i) {
                string sub = s.substr(i, 10);
                if (++cnt[sub] == 2) {
                    ans.push_back(sub);
                }
            }
            return ans;
        }
    };
    
  • class Solution:
        def findRepeatedDnaSequences(self, s: str) -> List[str]:
            words = set() # for encoded words
            repeated = set() # for encoded words
            result = [] # not encoded words
    
            # A => binary: 00
            # C => binary: 01
            # G => binary: 10
            # T => binary: 11
            mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    
            for i in range(len(s) - 9):
                v = 0
                for j in range(i, i + 10):
                    # every time, use the new 2 bits after shifting for current char
                    v <<= 2
                    v |= mapping[s[j]]
                if v in words and v not in repeated:
                    repeated.add(v)
                    result.append(s[i:i + 10])
    
                words.add(v)
    
            return result
    
    
    class Solution:
        def findRepeatedDnaSequences(self, s: str) -> List[str]:
            n = len(s) - 10
            cnt = Counter()
            ans = []
            for i in range(n + 1):
                sub = s[i : i + 10]
                cnt[sub] += 1
                if cnt[sub] == 2:
                    ans.append(sub)
            return ans
    
    ############
    
    class Solution(object):
      def findRepeatedDnaSequences(self, s):
        """
        :type s: str
        :rtype: List[str]
        """
        d = {}
        ans = []
        for i in range(len(s) - 9):
          key = s[i:i + 10]
          if key in d:
            d[key] += 1
            if d[key] == 2:
              ans.append(key)
          else:
            d[key] = 1
        return ans
    
    
  • func findRepeatedDnaSequences(s string) []string {
    	hashCode := map[byte]int{'A': 0, 'C': 1, 'G': 2, 'T': 3}
    	ans, cnt, left, right := []string{}, map[int]int{}, 0, 0
    
    	sha, multi := 0, int(math.Pow(4, 9))
    	for ; right < len(s); right++ {
    		sha = sha*4 + hashCode[s[right]]
    		if right-left+1 < 10 {
    			continue
    		}
    		cnt[sha]++
    		if cnt[sha] == 2 {
    			ans = append(ans, s[left:right+1])
    		}
    		sha, left = sha-multi*hashCode[s[left]], left+1
    	}
    	return ans
    }
    
  • function findRepeatedDnaSequences(s: string): string[] {
        const n = s.length;
        const map = new Map<string, boolean>();
        const res = [];
        for (let i = 0; i <= n - 10; i++) {
            const key = s.slice(i, i + 10);
            if (map.has(key) && map.get(key)) {
                res.push(key);
            }
            map.set(key, !map.has(key));
        }
        return res;
    }
    
    
  • /**
     * @param {string} s
     * @return {string[]}
     */
    var findRepeatedDnaSequences = function (s) {
        const n = s.length - 10;
        let cnt = new Map();
        let ans = [];
        for (let i = 0; i <= n; ++i) {
            let sub = s.slice(i, i + 10);
            cnt[sub] = (cnt[sub] || 0) + 1;
            if (cnt[sub] == 2) {
                ans.push(sub);
            }
        }
        return ans;
    };
    
    
  • using System.Collections.Generic;
    
    public class Solution {
        public IList<string> FindRepeatedDnaSequences(string s) {
            var once = new HashSet<int>();
            var moreThanOnce = new HashSet<int>();
            int bits = 0;
            for (var i = 0; i < s.Length; ++i)
            {
                bits <<= 2;
                switch (s[i])
                {
                    case 'A':
                        break;
                    case 'C':
                        bits |= 1;
                        break;
                    case 'G':
                        bits |= 2;
                        break;
                    case 'T':
                        bits |= 3;
                        break;
                }
                if (i >= 10)
                {
                    bits &= 0xFFFFF;
                }
                if (i >= 9 && !once.Add(bits))
                {
                    moreThanOnce.Add(bits);
                }
            }
            
            var results = new List<string>();
            foreach (var item in moreThanOnce)
            {
                var itemCopy = item;
                var charArray = new char[10];
                for (var i = 9; i >= 0; --i)
                {
                    switch (itemCopy & 3)
                    {
                        case 0:
                            charArray[i] = 'A';
                            break;
                        case 1:
                            charArray[i] = 'C';
                            break;
                        case 2:
                            charArray[i] = 'G';
                            break;
                        case 3:
                            charArray[i] = 'T';
                            break;
                    }
                    itemCopy >>= 2;
                }
                results.Add(new string(charArray));
            }
            return results;
        }
    }
    
  • use std::collections::HashMap;
    
    impl Solution {
        pub fn find_repeated_dna_sequences(s: String) -> Vec<String> {
            let n = s.len();
            let mut res = vec![];
            if n < 10 {
                return res;
            }
            let mut map = HashMap::new();
            for i in 0..=n - 10 {
                let key = &s[i..i + 10];
                if map.contains_key(&key) && *map.get(&key).unwrap() {
                    res.push(key.to_string());
                }
                map.insert(key, !map.contains_key(&key));
            }
            res
        }
    }
    
    

All Problems

All Solutions