Welcome to Subscribe On Youtube
Question
Formatted question description: https://leetcode.ca/all/187.html
The DNA sequence is composed of a series of nucleotides abbreviated as 'A'
, 'C'
, 'G'
, and 'T'
.
- For example,
"ACGAATTCCG"
is a DNA sequence.
When studying DNA, it is useful to identify repeated sequences within the DNA.
Given a string s
that represents a DNA sequence, return all the 10
-letter-long sequences (substrings) that occur more than once in a DNA molecule. You may return the answer in any order.
Example 1:
Input: s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT" Output: ["AAAAACCCCC","CCCCCAAAAA"]
Example 2:
Input: s = "AAAAAAAAAAAAA" Output: ["AAAAAAAAAA"]
Constraints:
1 <= s.length <= 105
s[i]
is either'A'
,'C'
,'G'
, or'T'
.
Algorithm
Using HashSet
can have the characteristics of duplicate items.
A => binary: 00
C => binary: 01
G => binary: 10
T => binary: 11
Basically, this questions is more of a compression problem
, how to encode the 10-char string to use less disk space.
- plain char is 8 bits (2 bytes)
- encoding to 2 bits (00,01,10,11)
Code
-
public class Repeated_DNA_Sequences { public static void main(String[] args) { Repeated_DNA_Sequences out = new Repeated_DNA_Sequences(); Solution s = out.new Solution(); System.out.println(s.findRepeatedDnaSequences("AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT")); } class Solution_optimize { public List<String> findRepeatedDnaSequences(String s) { Set<Integer> words = new HashSet<>(); // need this to remove duplicates // Input "AAAAAAAAAAAAA", result ["AAAAAAAAAA"] // to avoid result ["AAAAAAAAAA","AAAAAAAAAA","AAAAAAAAAA"] Set<Integer> repteated = new HashSet<>(); List<String> result = new ArrayList<>(); char[] map = new char[26]; //map['A' - 'A'] = 0; // binary: 00 map['C' - 'A'] = 1; // binary: 01 map['G' - 'A'] = 2; // binary: 10 map['T' - 'A'] = 3; // binary: 11 for(int i = 0; i < s.length() - 9; i++) { int v = 0; for(int j = i; j < i + 10; j++) { v <<= 2; // every time, use the new 2 bits after shifting for current char v |= map[s.charAt(j) - 'A']; } if(words.contains(v) && !repteated.contains(v)) { repteated.add(v); result.add(s.substring(i, i + 10)); } words.add(v); // re-add, if already contains } return result; } } class Solution { HashSet<String> existed = new HashSet<>(); HashSet<String> result = new HashSet<>(); public List<String> findRepeatedDnaSequences(String s) { if (s.length() <= 10) { return new ArrayList<String>(); } for (int i = 0; i < 10; i++) { for (int j = i; j + 9 < s.length(); j += 10) { String one = s.substring(j, j + 10); if (existed.contains(one)) { result.add(one); } existed.add(one); } } return new ArrayList<String>(result); } } } ############ class Solution { public List<String> findRepeatedDnaSequences(String s) { int n = s.length() - 10; Map<String, Integer> cnt = new HashMap<>(); List<String> ans = new ArrayList<>(); for (int i = 0; i <= n; ++i) { String sub = s.substring(i, i + 10); cnt.put(sub, cnt.getOrDefault(sub, 0) + 1); if (cnt.get(sub) == 2) { ans.add(sub); } } return ans; } }
-
class Solution { public: vector<string> findRepeatedDnaSequences(string s) { map<string, int> cnt; int n = s.size() - 10; vector<string> ans; for (int i = 0; i <= n; ++i) { string sub = s.substr(i, 10); if (++cnt[sub] == 2) { ans.push_back(sub); } } return ans; } };
-
class Solution: def findRepeatedDnaSequences(self, s: str) -> List[str]: words = set() # for encoded words repeated = set() # for encoded words result = [] # not encoded words # A => binary: 00 # C => binary: 01 # G => binary: 10 # T => binary: 11 mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3} for i in range(len(s) - 9): v = 0 for j in range(i, i + 10): # every time, use the new 2 bits after shifting for current char v <<= 2 v |= mapping[s[j]] if v in words and v not in repeated: repeated.add(v) result.append(s[i:i + 10]) words.add(v) return result class Solution: def findRepeatedDnaSequences(self, s: str) -> List[str]: n = len(s) - 10 cnt = Counter() ans = [] for i in range(n + 1): sub = s[i : i + 10] cnt[sub] += 1 if cnt[sub] == 2: ans.append(sub) return ans ############ class Solution(object): def findRepeatedDnaSequences(self, s): """ :type s: str :rtype: List[str] """ d = {} ans = [] for i in range(len(s) - 9): key = s[i:i + 10] if key in d: d[key] += 1 if d[key] == 2: ans.append(key) else: d[key] = 1 return ans
-
func findRepeatedDnaSequences(s string) []string { hashCode := map[byte]int{'A': 0, 'C': 1, 'G': 2, 'T': 3} ans, cnt, left, right := []string{}, map[int]int{}, 0, 0 sha, multi := 0, int(math.Pow(4, 9)) for ; right < len(s); right++ { sha = sha*4 + hashCode[s[right]] if right-left+1 < 10 { continue } cnt[sha]++ if cnt[sha] == 2 { ans = append(ans, s[left:right+1]) } sha, left = sha-multi*hashCode[s[left]], left+1 } return ans }
-
function findRepeatedDnaSequences(s: string): string[] { const n = s.length; const map = new Map<string, boolean>(); const res = []; for (let i = 0; i <= n - 10; i++) { const key = s.slice(i, i + 10); if (map.has(key) && map.get(key)) { res.push(key); } map.set(key, !map.has(key)); } return res; }
-
/** * @param {string} s * @return {string[]} */ var findRepeatedDnaSequences = function (s) { const n = s.length - 10; let cnt = new Map(); let ans = []; for (let i = 0; i <= n; ++i) { let sub = s.slice(i, i + 10); cnt[sub] = (cnt[sub] || 0) + 1; if (cnt[sub] == 2) { ans.push(sub); } } return ans; };
-
using System.Collections.Generic; public class Solution { public IList<string> FindRepeatedDnaSequences(string s) { var once = new HashSet<int>(); var moreThanOnce = new HashSet<int>(); int bits = 0; for (var i = 0; i < s.Length; ++i) { bits <<= 2; switch (s[i]) { case 'A': break; case 'C': bits |= 1; break; case 'G': bits |= 2; break; case 'T': bits |= 3; break; } if (i >= 10) { bits &= 0xFFFFF; } if (i >= 9 && !once.Add(bits)) { moreThanOnce.Add(bits); } } var results = new List<string>(); foreach (var item in moreThanOnce) { var itemCopy = item; var charArray = new char[10]; for (var i = 9; i >= 0; --i) { switch (itemCopy & 3) { case 0: charArray[i] = 'A'; break; case 1: charArray[i] = 'C'; break; case 2: charArray[i] = 'G'; break; case 3: charArray[i] = 'T'; break; } itemCopy >>= 2; } results.Add(new string(charArray)); } return results; } }
-
use std::collections::HashMap; impl Solution { pub fn find_repeated_dna_sequences(s: String) -> Vec<String> { let n = s.len(); let mut res = vec![]; if n < 10 { return res; } let mut map = HashMap::new(); for i in 0..=n - 10 { let key = &s[i..i + 10]; if map.contains_key(&key) && *map.get(&key).unwrap() { res.push(key.to_string()); } map.insert(key, !map.contains_key(&key)); } res } }