# Question

Formatted question description: https://leetcode.ca/all/187.html

The DNA sequence is composed of a series of nucleotides abbreviated as 'A', 'C', 'G', and 'T'.

• For example, "ACGAATTCCG" is a DNA sequence.

When studying DNA, it is useful to identify repeated sequences within the DNA.

Given a string s that represents a DNA sequence, return all the 10-letter-long sequences (substrings) that occur more than once in a DNA molecule. You may return the answer in any order.

Example 1:

Input: s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT"
Output: ["AAAAACCCCC","CCCCCAAAAA"]


Example 2:

Input: s = "AAAAAAAAAAAAA"
Output: ["AAAAAAAAAA"]


Constraints:

• 1 <= s.length <= 105
• s[i] is either 'A', 'C', 'G', or 'T'.

# Algorithm

Using HashSet can have the characteristics of duplicate items.

A => binary: 00
C => binary: 01
G => binary: 10
T => binary: 11



Basically, this questions is more of a compression problem, how to encode the 10-char string to use less disk space.

• plain char is 8 bits (2 bytes)
• encoding to 2 bits (00,01,10,11)

# Code

• 
public class Repeated_DNA_Sequences {

public static void main(String[] args) {
Repeated_DNA_Sequences out = new Repeated_DNA_Sequences();
Solution s = out.new Solution();

System.out.println(s.findRepeatedDnaSequences("AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT"));
}

class Solution_optimize {
public List<String> findRepeatedDnaSequences(String s) {
Set<Integer> words = new HashSet<>();

// need this to remove duplicates
// Input "AAAAAAAAAAAAA", result ["AAAAAAAAAA"]
// to avoid result ["AAAAAAAAAA","AAAAAAAAAA","AAAAAAAAAA"]
Set<Integer> repteated = new HashSet<>();
List<String> result = new ArrayList<>();

char[] map = new char[26];
//map['A' - 'A'] = 0; // binary: 00
map['C' - 'A'] = 1; // binary: 01
map['G' - 'A'] = 2; // binary: 10
map['T' - 'A'] = 3; // binary: 11

for(int i = 0; i < s.length() - 9; i++) {
int v = 0;
for(int j = i; j < i + 10; j++) {
v <<= 2; // every time, use the new 2 bits after shifting for current char
v |= map[s.charAt(j) - 'A'];
}
if(words.contains(v) && !repteated.contains(v)) {
}

}
return result;
}
}

class Solution {
HashSet<String> existed = new HashSet<>();
HashSet<String> result = new HashSet<>();

public List<String> findRepeatedDnaSequences(String s) {

if (s.length() <= 10) {
return new ArrayList<String>();
}

for (int i = 0; i < 10; i++) {
for (int j = i; j + 9 < s.length(); j += 10) {

String one = s.substring(j, j + 10);

if (existed.contains(one)) {
}

}
}

return new ArrayList<String>(result);
}

}
}

############

class Solution {
public List<String> findRepeatedDnaSequences(String s) {
int n = s.length() - 10;
Map<String, Integer> cnt = new HashMap<>();
List<String> ans = new ArrayList<>();
for (int i = 0; i <= n; ++i) {
String sub = s.substring(i, i + 10);
cnt.put(sub, cnt.getOrDefault(sub, 0) + 1);
if (cnt.get(sub) == 2) {
}
}
return ans;
}
}

• class Solution {
public:
vector<string> findRepeatedDnaSequences(string s) {
map<string, int> cnt;
int n = s.size() - 10;
vector<string> ans;
for (int i = 0; i <= n; ++i) {
string sub = s.substr(i, 10);
if (++cnt[sub] == 2) {
ans.push_back(sub);
}
}
return ans;
}
};

• class Solution:
def findRepeatedDnaSequences(self, s: str) -> List[str]:
words = set() # for encoded words
repeated = set() # for encoded words
result = [] # not encoded words

# A => binary: 00
# C => binary: 01
# G => binary: 10
# T => binary: 11
mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

for i in range(len(s) - 9):
v = 0
for j in range(i, i + 10):
# every time, use the new 2 bits after shifting for current char
v <<= 2
v |= mapping[s[j]]
if v in words and v not in repeated:
result.append(s[i:i + 10])

return result

class Solution:
def findRepeatedDnaSequences(self, s: str) -> List[str]:
n = len(s) - 10
cnt = Counter()
ans = []
for i in range(n + 1):
sub = s[i : i + 10]
cnt[sub] += 1
if cnt[sub] == 2:
ans.append(sub)
return ans

############

class Solution(object):
def findRepeatedDnaSequences(self, s):
"""
:type s: str
:rtype: List[str]
"""
d = {}
ans = []
for i in range(len(s) - 9):
key = s[i:i + 10]
if key in d:
d[key] += 1
if d[key] == 2:
ans.append(key)
else:
d[key] = 1
return ans


• func findRepeatedDnaSequences(s string) []string {
hashCode := map[byte]int{'A': 0, 'C': 1, 'G': 2, 'T': 3}
ans, cnt, left, right := []string{}, map[int]int{}, 0, 0

sha, multi := 0, int(math.Pow(4, 9))
for ; right < len(s); right++ {
sha = sha*4 + hashCode[s[right]]
if right-left+1 < 10 {
continue
}
cnt[sha]++
if cnt[sha] == 2 {
ans = append(ans, s[left:right+1])
}
sha, left = sha-multi*hashCode[s[left]], left+1
}
return ans
}

• function findRepeatedDnaSequences(s: string): string[] {
const n = s.length;
const map = new Map<string, boolean>();
const res = [];
for (let i = 0; i <= n - 10; i++) {
const key = s.slice(i, i + 10);
if (map.has(key) && map.get(key)) {
res.push(key);
}
map.set(key, !map.has(key));
}
return res;
}


• /**
* @param {string} s
* @return {string[]}
*/
var findRepeatedDnaSequences = function (s) {
const n = s.length - 10;
let cnt = new Map();
let ans = [];
for (let i = 0; i <= n; ++i) {
let sub = s.slice(i, i + 10);
cnt[sub] = (cnt[sub] || 0) + 1;
if (cnt[sub] == 2) {
ans.push(sub);
}
}
return ans;
};


• using System.Collections.Generic;

public class Solution {
public IList<string> FindRepeatedDnaSequences(string s) {
var once = new HashSet<int>();
var moreThanOnce = new HashSet<int>();
int bits = 0;
for (var i = 0; i < s.Length; ++i)
{
bits <<= 2;
switch (s[i])
{
case 'A':
break;
case 'C':
bits |= 1;
break;
case 'G':
bits |= 2;
break;
case 'T':
bits |= 3;
break;
}
if (i >= 10)
{
bits &= 0xFFFFF;
}
if (i >= 9 && !once.Add(bits))
{
}
}

var results = new List<string>();
foreach (var item in moreThanOnce)
{
var itemCopy = item;
var charArray = new char[10];
for (var i = 9; i >= 0; --i)
{
switch (itemCopy & 3)
{
case 0:
charArray[i] = 'A';
break;
case 1:
charArray[i] = 'C';
break;
case 2:
charArray[i] = 'G';
break;
case 3:
charArray[i] = 'T';
break;
}
itemCopy >>= 2;
}
}
return results;
}
}

• use std::collections::HashMap;

impl Solution {
pub fn find_repeated_dna_sequences(s: String) -> Vec<String> {
let n = s.len();
let mut res = vec![];
if n < 10 {
return res;
}
let mut map = HashMap::new();
for i in 0..=n - 10 {
let key = &s[i..i + 10];
if map.contains_key(&key) && *map.get(&key).unwrap() {
res.push(key.to_string());
}
map.insert(key, !map.contains_key(&key));
}
res
}
}