Multiple sequence alignment in Java
(The entire project lives here.)
Problem definition
Suppose we are given three genomic strings drawn from the alphabet of 20 amino acids:
ACGH
CFG
EAC
In multiple sequence alignment problem we want to align them optimally in order to access their evolutionary similarity. Such an alignment may look like the following:
-AC-GH
--CFG-
EAC---
Solution
Often, the problem is cast as a pathfinding problem in $k$-dimensional lattice, where $k$ is the number of strings. Here is my program, it compares A* with Dijkstra's algorithms:
MultipleSequenceAlignmentInstance.java
package net.coderodde.bio.msa;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Set;
public final class MultipleSequenceAlignmentInstance {
/**
* The penalty for pairs in which there is one valid character and one gap.
*/
private final int gapPenalty;
/**
* The character pairs cost matrix.
*/
private final CostMatrix<Integer> costMatrix;
/**
* The array of sequences to be aligned.
*/
private final String sequenceArray;
// A small speed optimization:
private final char column;
public MultipleSequenceAlignmentInstance(CostMatrix<Integer> costMatrix,
int gapPenalty,
String... sequenceArray) {
this.costMatrix = Objects.requireNonNull(costMatrix,
"Cost matrix is null");
this.gapPenalty = gapPenalty;
this.sequenceArray = sequenceArray.clone();
this.column = new char[sequenceArray.length];
for (int i = 0; i != sequenceArray.length; ++i) {
this.sequenceArray[i] =
checkIsValidGenomicSequence(sequenceArray[i]);
}
}
public Alignment align() {
long start = System.currentTimeMillis();
HeuristicFunction hf = new HeuristicFunctionComputer()
.computeHeuristicFunction(this);
long end = System.currentTimeMillis();
System.out.println("Computed heuristic function in " + (end - start) +
" milliseconds.");
Queue<LatticeNodeHolder> open = new PriorityQueue<>();
Map<LatticeNode, Integer> distance = new HashMap<>();
Map<LatticeNode, LatticeNode> parents = new HashMap<>();
LatticeNode sourceNode = getSourceNode();
LatticeNode targetNode = getTargetNode();
open.add(new LatticeNodeHolder(sourceNode, 0));
distance.put(sourceNode, 0);
parents.put(sourceNode, null);
Set<LatticeNode> closed = new HashSet<>();
while (true) {
LatticeNode currentNode = open.remove().getNode();
if (currentNode.equals(targetNode)) {
return tracebackPath(parents, distance.get(currentNode));
}
if (closed.contains(currentNode)) {
continue;
}
closed.add(currentNode);
for (LatticeNode childNode : currentNode.getChildren()) {
if (closed.contains(childNode)) {
continue;
}
int tentativeCost = distance.get(currentNode) +
getWeight(currentNode, childNode);
Integer currentCost = distance.get(childNode);
if (currentCost == null || currentCost > tentativeCost) {
open.add(new LatticeNodeHolder(childNode,
tentativeCost +
hf.get(childNode)));
distance.put(childNode, tentativeCost);
parents.put(childNode, currentNode);
}
}
}
}
public Alignment alignBrute() {
Queue<LatticeNodeHolder> open = new PriorityQueue<>();
Map<LatticeNode, Integer> distance = new HashMap<>();
Map<LatticeNode, LatticeNode> parents = new HashMap<>();
LatticeNode sourceNode = getSourceNode();
LatticeNode targetNode = getTargetNode();
open.add(new LatticeNodeHolder(sourceNode, 0));
distance.put(sourceNode, 0);
parents.put(sourceNode, null);
Set<LatticeNode> closed = new HashSet<>();
while (true) {
LatticeNode currentNode = open.remove().getNode();
if (currentNode.equals(targetNode)) {
return tracebackPath(parents, distance.get(currentNode));
}
if (closed.contains(currentNode)) {
continue;
}
closed.add(currentNode);
for (LatticeNode childNode : currentNode.getChildren()) {
if (closed.contains(childNode)) {
continue;
}
int tentativeCost = distance.get(currentNode) +
getWeight(currentNode, childNode);
Integer currentCost = distance.get(childNode);
if (currentCost == null || currentCost > tentativeCost) {
open.add(new LatticeNodeHolder(childNode, tentativeCost));
distance.put(childNode, tentativeCost);
parents.put(childNode, currentNode);
}
}
}
}
/**
* Create the source node for the sequence alignment task, i.e., a lattice
* node with all coordinates set to zero.
*
* @return the source node.
*/
LatticeNode getSourceNode() {
int sourceCoordinates = new int[sequenceArray.length];
return new LatticeNode(this, sourceCoordinates);
}
/**
* Create the target node for the sequence alignment task, i.e., a lattice
* node with all coordinates set to the respective sequence lengths.
*
* @return the target node.
*/
LatticeNode getTargetNode() {
int targetCoordinates = new int[sequenceArray.length];
for (int i = 0; i != sequenceArray.length; ++i) {
targetCoordinates[i] = sequenceArray[i].length();
}
return new LatticeNode(this, targetCoordinates);
}
Integer getWeight(LatticeNode tail,
LatticeNode head,
int dimension1,
int dimension2) {
int tailCoordinates = tail.getCoordinates();
int headCoordinates = head.getCoordinates();
if (tailCoordinates[dimension1] == headCoordinates[dimension1]) {
//System.out.println("1");
return gapPenalty;
} else if (tailCoordinates[dimension2] == headCoordinates[dimension2]) {
//System.out.println("2");
return gapPenalty;
} else {
// System.out.println("3");
char character1 = sequenceArray[dimension1]
.charAt(tailCoordinates[dimension1]);
char character2 = sequenceArray[dimension2]
.charAt(tailCoordinates[dimension2]);
return costMatrix.getCost(character1, character2);
}
}
Integer getWeight(LatticeNode tail, LatticeNode head) {
// Extract the column represented by taking a single hop from 'tail' to
// 'head':
int tailCoordinates = tail.getCoordinates();
int headCoordinates = head.getCoordinates();
for (int i = 0; i < sequenceArray.length; ++i) {
if (tailCoordinates[i] + 1 == headCoordinates[i]) {
column[i] = sequenceArray[i].charAt(tailCoordinates[i]);
} else {
column[i] = AminoAcidAlphabet.GAP_CHARACTER;
}
}
// Compute the hop cost as the sum of pairwise hops in any plane:
int cost = 0;
for (int i = 0; i < column.length; ++i) {
char character1 = column[i];
for (int j = i + 1; j < column.length; ++j) {
char character2 = column[j];
if (character1 != AminoAcidAlphabet.GAP_CHARACTER) {
if (character2 != AminoAcidAlphabet.GAP_CHARACTER) {
cost += costMatrix.getCost(character1, character2);
} else {
cost += gapPenalty;
}
} else {
// character1 IS the gap character:
if (character2 != AminoAcidAlphabet.GAP_CHARACTER) {
cost += gapPenalty;
} else {
// Do nothing since we have a pair (gap, gap).
}
}
}
}
return cost;
}
String getSequenceArray() {
return sequenceArray;
}
private Alignment tracebackPath(Map<LatticeNode, LatticeNode> parents,
Integer cost) {
List<LatticeNode> path = new ArrayList<>();
LatticeNode node = getTargetNode();
while (node != null) {
path.add(node);
node = parents.get(node);
}
Collections.<LatticeNode>reverse(path);
String strings = new String[getSequenceArray().length];
StringBuilder stringBuilders = new StringBuilder[strings.length];
for (int i = 0; i < stringBuilders.length; ++i) {
stringBuilders[i] = new StringBuilder();
}
for (int i = 1; i < path.size(); ++i) {
LatticeNode tail = path.get(i - 1);
LatticeNode head = path.get(i);
int tailCoordinates = tail.getCoordinates();
int headCoordinates = head.getCoordinates();
for (int j = 0; j < tailCoordinates.length; ++j) {
if (tailCoordinates[j] != headCoordinates[j]) {
stringBuilders[j].
append(sequenceArray[j].charAt(tailCoordinates[j]));
} else {
stringBuilders[j].append(AminoAcidAlphabet.GAP_CHARACTER);
}
}
}
for (int i = 0; i < strings.length; ++i) {
strings[i] = stringBuilders[i].toString();
}
return new Alignment(strings, cost);
}
private String checkIsValidGenomicSequence(String string) {
Set<Character> characterSet =
AminoAcidAlphabet.getAminoAcidAlphabet()
.getCharacterSet();
for (char c : string.toCharArray()) {
if (!characterSet.contains(c)) {
throw new IllegalArgumentException("Unknown amino acid: " + c);
}
}
return string;
}
private static final class LatticeNodeHolder
implements Comparable<LatticeNodeHolder> {
private final Integer fScore;
private final LatticeNode node;
LatticeNodeHolder(LatticeNode node, Integer fScore) {
this.fScore = fScore;
this.node = node;
}
LatticeNode getNode() {
return node;
}
@Override
public int compareTo(LatticeNodeHolder o) {
return Integer.compare(fScore, o.fScore);
}
}
}
HeuristicFunctionComputer.java
package net.coderodde.bio.msa;
import java.awt.Point;
import java.util.HashSet;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Set;
final class HeuristicFunctionComputer {
HeuristicFunction computeHeuristicFunction(
MultipleSequenceAlignmentInstance instance) {
String sequenceArray = instance.getSequenceArray();
HeuristicFunction heuristicFunction = new HeuristicFunction(instance);
for (int dimension1 = 0;
dimension1 < sequenceArray.length;
dimension1++) {
for (int dimension2 = dimension1 + 1;
dimension2 < sequenceArray.length;
dimension2++) {
loadPartialHeuristicFunction(heuristicFunction,
dimension1,
dimension2,
instance);
}
}
return heuristicFunction;
}
// Basically, this method runs Dijkstra backwards from the target node until
// the entire 2D-grid is explored.
private void loadPartialHeuristicFunction(
HeuristicFunction heuristicFunction,
int dimension1,
int dimension2,
MultipleSequenceAlignmentInstance instance) {
LatticeNode target = instance.getTargetNode();
Queue<LatticeNodeHolder> open = new PriorityQueue<>();
Set<LatticeNode> closed = new HashSet<>();
open.add(new LatticeNodeHolder(target, 0));
Point point = new Point();
while (!open.isEmpty()) {
LatticeNodeHolder currentNodeHolder = open.remove();
LatticeNode currentNode = currentNodeHolder.getLatticeNode();
if (closed.contains(currentNode)) {
continue;
}
closed.add(currentNode);
Integer currentNodeCost = currentNodeHolder.getCost();
extractPoint(point, currentNode, dimension1, dimension2);
heuristicFunction.put(dimension1,
dimension2,
new Point(point),
currentNodeCost);
for (LatticeNode parent : currentNode.getParents()) {
if (closed.contains(parent)) {
continue;
}
int tentativeCost =
currentNodeCost + instance.getWeight(parent,
currentNode,
dimension1,
dimension2);
extractPoint(point, parent, dimension1, dimension2);
if (!heuristicFunction.containsPartial(dimension1,
dimension2,
point)
|| heuristicFunction.getPartial(dimension1,
dimension2,
point)
> tentativeCost) {
open.add(new LatticeNodeHolder(parent, tentativeCost));
}
}
}
}
private static void extractPoint(Point point,
LatticeNode latticeNode,
int i,
int j) {
int coordinates = latticeNode.getCoordinates();
point.x = coordinates[i];
point.y = coordinates[j];
}
private static final class LatticeNodeHolder
implements Comparable<LatticeNodeHolder> {
private final LatticeNode node;
private final Integer cost;
LatticeNodeHolder(LatticeNode node, Integer cost) {
this.node = node;
this.cost = cost;
}
LatticeNode getLatticeNode() {
return node;
}
Integer getCost() {
return cost;
}
@Override
public int compareTo(LatticeNodeHolder o) {
return Integer.compare(cost, o.cost);
}
}
}
LatticeNode.java
package net.coderodde.bio.msa;
import com.sun.corba.se.spi.orb.OperationFactory;
import java.util.Arrays;
final class LatticeNode {
/**
* The problem instance this lattice node contributes to.
*/
private final MultipleSequenceAlignmentInstance instance;
/**
* The coordinates of this node in the entire lattice.
*/
private final int coordinates;
LatticeNode(MultipleSequenceAlignmentInstance instance, int coordinates) {
this.instance = instance;
this.coordinates = coordinates;
}
@Override
public boolean equals(Object o) {
return Arrays.equals(coordinates, ((LatticeNode) o).coordinates);
}
// Generated by NetBeans 8.1:
@Override
public int hashCode() {
int hash = 7;
hash = 41 * hash + Arrays.hashCode(this.coordinates);
return hash;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder("[");
String separator = "";
for (int coordinate : coordinates) {
sb.append(separator).append(coordinate);
separator = ", ";
}
return sb.append("]").toString();
}
LatticeNode getChildren() {
// Find out in how many dimension we can move forward:
int dimensionsNotReached = 0;
String sequenceArray = instance.getSequenceArray();
boolean inclusionMap = new boolean[coordinates.length];
for (int i = 0; i < sequenceArray.length; ++i) {
if (coordinates[i] < sequenceArray[i].length()) {
dimensionsNotReached++;
// We can make a step forward in the direction of ith dimension:
inclusionMap[i] = true;
}
}
// Create the array of children:
int numberOfChildren = pow2(dimensionsNotReached) - 1;
LatticeNode children = new LatticeNode[numberOfChildren];
loadChildren(children, inclusionMap);
// Convert offsets to actual child coordinates:
for (LatticeNode child : children) {
child.addOffsets(coordinates);
}
return children;
}
LatticeNode getParents() {
// Find out in how many dimensions we can move BACKward:
int dimensionsNotReached = 0;
String sequenceArray = instance.getSequenceArray();
boolean inclusionMap = new boolean[coordinates.length];
for (int i = 0; i < sequenceArray.length; ++i) {
if (coordinates[i] > 0) {
dimensionsNotReached++;
// We can make a step backwards in the direction of ith
// dimension:
inclusionMap[i] = true;
}
}
// Create the array of parents:
int numberOfParents = pow2(dimensionsNotReached) - 1;
LatticeNode parents = new LatticeNode[numberOfParents];
loadParents(parents, inclusionMap);
// Convert the offsets to actual parent coordinates:
for (LatticeNode parent : parents) {
parent.addOffsets(coordinates);
}
return parents;
}
int getCoordinates() {
return coordinates;
}
private void loadChildren(LatticeNode children, boolean inclusionMap) {
int coords = new int[this.coordinates.length];
for (int i = 0; i != children.length; ++i) {
increment(coords, inclusionMap);
children[i] = new LatticeNode(instance, coords.clone());
}
}
private void loadParents(LatticeNode parents, boolean inclusionMap) {
int coords = new int[this.coordinates.length];
for (int i = 0; i != parents.length; ++i) {
decrement(coords, inclusionMap);
parents[i] = new LatticeNode(instance, coords.clone());
}
}
private static void increment(int coordinates, boolean inclusionMap) {
for (int i = 0; i < coordinates.length; ++i) {
if (!inclusionMap[i]) {
continue;
}
if (coordinates[i] == 0) {
coordinates[i] = 1;
return;
}
coordinates[i] = 0;
}
}
private static void decrement(int coordinates, boolean inclusionMap) {
for (int i = 0; i < coordinates.length; ++i) {
if (!inclusionMap[i]) {
continue;
}
if (coordinates[i] == 0) {
coordinates[i] = -1;
return;
}
coordinates[i] = 0;
}
}
private void addOffsets(int offsets) {
for (int i = 0; i < coordinates.length; ++i) {
coordinates[i] += offsets[i];
}
}
private static int pow2(int exponent) {
int ret = 1;
for (int e = 0; e < exponent; ++e) {
ret *= 2;
}
return ret;
}
}
PAM250CostMatrix.java
package net.coderodde.bio.msa;
import java.util.HashMap;
import java.util.Map;
import net.coderodde.bio.msa.AminoAcidAlphabet;
import net.coderodde.bio.msa.CostMatrix;
public final class PAM250CostMatrix implements CostMatrix<Integer> {
private static PAM250CostMatrix instance;
private final Map<Character, Map<Character, Integer>> m = new HashMap<>();
public static PAM250CostMatrix getPAM250CostMatrix() {
if (instance == null) {
instance = new PAM250CostMatrix();
}
return instance;
}
private PAM250CostMatrix() {
AminoAcidAlphabet alphabet = AminoAcidAlphabet.getAminoAcidAlphabet();
alphabet.getCharacterSet().stream().forEach((character) -> {
m.put(character, new HashMap<>());
});
m.get('A').put('A', -2);
m.get('R').put('A', 2);
m.get('A').put('R', 2);
m.get('R').put('R', -6);
m.get('N').put('A', 0);
m.get('A').put('N', 0);
m.get('N').put('R', 0);
m.get('R').put('N', 0);
m.get('N').put('N', -2);
m.get('D').put('A', 0);
m.get('A').put('D', 0);
m.get('D').put('R', 1);
m.get('R').put('D', 1);
m.get('D').put('N', -2);
m.get('N').put('D', -2);
m.get('D').put('D', -4);
m.get('C').put('A', 2);
m.get('A').put('C', 2);
m.get('C').put('R', 4);
m.get('R').put('C', 4);
m.get('C').put('N', 4);
m.get('N').put('C', 4);
m.get('C').put('D', 5);
m.get('D').put('C', 5);
m.get('C').put('C', -4);
m.get('Q').put('A', 0);
m.get('A').put('Q', 0);
m.get('Q').put('R', -1);
m.get('R').put('Q', -1);
m.get('Q').put('N', -1);
m.get('N').put('Q', -1);
m.get('Q').put('D', -2);
m.get('D').put('Q', -2);
m.get('Q').put('C', 5);
m.get('C').put('Q', 5);
m.get('Q').put('Q', -4);
m.get('E').put('A', 0);
m.get('A').put('E', 0);
m.get('E').put('R', 1);
m.get('R').put('E', 1);
m.get('E').put('N', -1);
m.get('N').put('E', -1);
m.get('E').put('D', -3);
m.get('D').put('E', -3);
m.get('E').put('C', 5);
m.get('C').put('E', 5);
m.get('E').put('Q', -2);
m.get('Q').put('E', -2);
m.get('E').put('E', -4);
m.get('G').put('A', -1);
m.get('A').put('G', -1);
m.get('G').put('R', 3);
m.get('R').put('G', 3);
m.get('G').put('N', 0);
m.get('N').put('G', 0);
m.get('G').put('D', -1);
m.get('D').put('G', -1);
m.get('G').put('C', 3);
m.get('C').put('G', 3);
m.get('G').put('Q', 1);
m.get('Q').put('G', 1);
m.get('G').put('E', 0);
m.get('E').put('G', 0);
m.get('G').put('G', -5);
m.get('H').put('A', 1);
m.get('A').put('H', 1);
m.get('H').put('R', -2);
m.get('R').put('H', -2);
m.get('H').put('N', -2);
m.get('N').put('H', -2);
m.get('H').put('D', -1);
m.get('D').put('H', -1);
m.get('H').put('C', 3);
m.get('C').put('H', 3);
m.get('H').put('Q', -3);
m.get('Q').put('H', -3);
m.get('H').put('E', -1);
m.get('E').put('H', -1);
m.get('H').put('G', 2);
m.get('G').put('H', 2);
m.get('H').put('H', -6);
m.get('I').put('A', 1);
m.get('A').put('I', 1);
m.get('I').put('R', 2);
m.get('R').put('I', 2);
m.get('I').put('N', 2);
m.get('N').put('I', 2);
m.get('I').put('D', 2);
m.get('D').put('I', 2);
m.get('I').put('C', 2);
m.get('C').put('I', 2);
m.get('I').put('Q', 2);
m.get('Q').put('I', 2);
m.get('I').put('E', 2);
m.get('E').put('I', 2);
m.get('I').put('G', 3);
m.get('G').put('I', 3);
m.get('I').put('H', 2);
m.get('H').put('I', 2);
m.get('I').put('I', -5);
m.get('L').put('A', 2);
m.get('A').put('L', 2);
m.get('L').put('R', 3);
m.get('R').put('L', 3);
m.get('L').put('N', 3);
m.get('N').put('L', 3);
m.get('L').put('D', 4);
m.get('D').put('L', 4);
m.get('L').put('C', 6);
m.get('C').put('L', 6);
m.get('L').put('Q', 2);
m.get('Q').put('L', 2);
m.get('L').put('E', 3);
m.get('E').put('L', 3);
m.get('L').put('G', 4);
m.get('G').put('L', 4);
m.get('L').put('H', 2);
m.get('H').put('L', 2);
m.get('L').put('I', -2);
m.get('I').put('L', -2);
m.get('L').put('L', -6);
m.get('K').put('A', 1);
m.get('A').put('K', 1);
m.get('K').put('R', -3);
m.get('R').put('K', -3);
m.get('K').put('N', -1);
m.get('N').put('K', -1);
m.get('K').put('D', 0);
m.get('D').put('K', 0);
m.get('K').put('C', 5);
m.get('C').put('K', 5);
m.get('K').put('Q', -1);
m.get('Q').put('K', -1);
m.get('K').put('E', 0);
m.get('E').put('K', 0);
m.get('K').put('G', 2);
m.get('G').put('K', 2);
m.get('K').put('H', 0);
m.get('H').put('K', 0);
m.get('K').put('I', 2);
m.get('I').put('K', 2);
m.get('K').put('L', 3);
m.get('L').put('K', 3);
m.get('K').put('K', -5);
m.get('M').put('A', 1);
m.get('A').put('M', 1);
m.get('M').put('R', 0);
m.get('R').put('M', 0);
m.get('M').put('N', 2);
m.get('N').put('M', 2);
m.get('M').put('D', 3);
m.get('D').put('M', 3);
m.get('M').put('C', 5);
m.get('C').put('M', 5);
m.get('M').put('Q', 1);
m.get('Q').put('M', 1);
m.get('M').put('E', 2);
m.get('E').put('M', 2);
m.get('M').put('G', 3);
m.get('G').put('M', 3);
m.get('M').put('H', 2);
m.get('H').put('M', 2);
m.get('M').put('I', -2);
m.get('I').put('M', -2);
m.get('M').put('L', -4);
m.get('L').put('M', -4);
m.get('M').put('K', 0);
m.get('K').put('M', 0);
m.get('M').put('M', -6);
m.get('F').put('A', 4);
m.get('A').put('F', 4);
m.get('F').put('R', 4);
m.get('R').put('F', 4);
m.get('F').put('N', 4);
m.get('N').put('F', 4);
m.get('F').put('D', 6);
m.get('D').put('F', 6);
m.get('F').put('C', 4);
m.get('C').put('F', 4);
m.get('F').put('Q', 5);
m.get('Q').put('F', 5);
m.get('F').put('E', 5);
m.get('E').put('F', 5);
m.get('F').put('G', 5);
m.get('G').put('F', 5);
m.get('F').put('H', 2);
m.get('H').put('F', 2);
m.get('F').put('I', -1);
m.get('I').put('F', -1);
m.get('F').put('L', -2);
m.get('L').put('F', -2);
m.get('F').put('K', 5);
m.get('K').put('F', 5);
m.get('F').put('M', 0);
m.get('M').put('F', 0);
m.get('F').put('F', -9);
m.get('P').put('A', -1);
m.get('A').put('P', -1);
m.get('P').put('R', 0);
m.get('R').put('P', 0);
m.get('P').put('N', 1);
m.get('N').put('P', 1);
m.get('P').put('D', 1);
m.get('D').put('P', 1);
m.get('P').put('C', 3);
m.get('C').put('P', 3);
m.get('P').put('Q', 0);
m.get('Q').put('P', 0);
m.get('P').put('E', 1);
m.get('E').put('P', 1);
m.get('P').put('G', 1);
m.get('G').put('P', 1);
m.get('P').put('H', 0);
m.get('H').put('P', 0);
m.get('P').put('I', 2);
m.get('I').put('P', 2);
m.get('P').put('L', 3);
m.get('L').put('P', 3);
m.get('P').put('K', 1);
m.get('K').put('P', 1);
m.get('P').put('M', 2);
m.get('M').put('P', 2);
m.get('P').put('F', 5);
m.get('F').put('P', 5);
m.get('P').put('P', -6);
m.get('S').put('A', -1);
m.get('A').put('S', -1);
m.get('S').put('R', 0);
m.get('R').put('S', 0);
m.get('S').put('N', -1);
m.get('N').put('S', -1);
m.get('S').put('D', 0);
m.get('D').put('S', 0);
m.get('S').put('C', 0);
m.get('C').put('S', 0);
m.get('S').put('Q', 1);
m.get('Q').put('S', 1);
m.get('S').put('E', 0);
m.get('E').put('S', 0);
m.get('S').put('G', -1);
m.get('G').put('S', -1);
m.get('S').put('H', 1);
m.get('H').put('S', 1);
m.get('S').put('I', 1);
m.get('I').put('S', 1);
m.get('S').put('L', 3);
m.get('L').put('S', 3);
m.get('S').put('K', 0);
m.get('K').put('S', 0);
m.get('S').put('M', 2);
m.get('M').put('S', 2);
m.get('S').put('F', 3);
m.get('F').put('S', 3);
m.get('S').put('P', -1);
m.get('P').put('S', -1);
m.get('S').put('S', -3);
m.get('T').put('A', -1);
m.get('A').put('T', -1);
m.get('T').put('R', 1);
m.get('R').put('T', 1);
m.get('T').put('N', 0);
m.get('N').put('T', 0);
m.get('T').put('D', 0);
m.get('D').put('T', 0);
m.get('T').put('C', 2);
m.get('C').put('T', 2);
m.get('T').put('Q', 1);
m.get('Q').put('T', 1);
m.get('T').put('E', 0);
m.get('E').put('T', 0);
m.get('T').put('G', 0);
m.get('G').put('T', 0);
m.get('T').put('H', 1);
m.get('H').put('T', 1);
m.get('T').put('I', 0);
m.get('I').put('T', 0);
m.get('T').put('L', 2);
m.get('L').put('T', 2);
m.get('T').put('K', 0);
m.get('K').put('T', 0);
m.get('T').put('M', 1);
m.get('M').put('T', 1);
m.get('T').put('F', 2);
m.get('F').put('T', 2);
m.get('T').put('P', 0);
m.get('P').put('T', 0);
m.get('T').put('S', -1);
m.get('S').put('T', -1);
m.get('T').put('T', -3);
m.get('W').put('A', 6);
m.get('A').put('W', 6);
m.get('W').put('R', -2);
m.get('R').put('W', -2);
m.get('W').put('N', 4);
m.get('N').put('W', 4);
m.get('W').put('D', 7);
m.get('D').put('W', 7);
m.get('W').put('C', 8);
m.get('C').put('W', 8);
m.get('W').put('Q', 5);
m.get('Q').put('W', 5);
m.get('W').put('E', 7);
m.get('E').put('W', 7);
m.get('W').put('G', 7);
m.get('G').put('W', 7);
m.get('W').put('H', 3);
m.get('H').put('W', 3);
m.get('W').put('I', 5);
m.get('I').put('W', 5);
m.get('W').put('L', 2);
m.get('L').put('W', 2);
m.get('W').put('K', 3);
m.get('K').put('W', 3);
m.get('W').put('M', 4);
m.get('M').put('W', 4);
m.get('W').put('F', 0);
m.get('F').put('W', 0);
m.get('W').put('P', 6);
m.get('P').put('W', 6);
m.get('W').put('S', 2);
m.get('S').put('W', 2);
m.get('W').put('T', 5);
m.get('T').put('W', 5);
m.get('W').put('W', -17);
m.get('Y').put('A', 3);
m.get('A').put('Y', 3);
m.get('Y').put('R', 4);
m.get('R').put('Y', 4);
m.get('Y').put('N', 2);
m.get('N').put('Y', 2);
m.get('Y').put('D', 4);
m.get('D').put('Y', 4);
m.get('Y').put('C', 0);
m.get('C').put('Y', 0);
m.get('Y').put('Q', 4);
m.get('Q').put('Y', 4);
m.get('Y').put('E', 4);
m.get('E').put('Y', 4);
m.get('Y').put('G', 5);
m.get('G').put('Y', 5);
m.get('Y').put('H', 0);
m.get('H').put('Y', 0);
m.get('Y').put('I', 1);
m.get('I').put('Y', 1);
m.get('Y').put('L', 1);
m.get('L').put('Y', 1);
m.get('Y').put('K', 4);
m.get('K').put('Y', 4);
m.get('Y').put('M', 2);
m.get('M').put('Y', 2);
m.get('Y').put('F', -7);
m.get('F').put('Y', -7);
m.get('Y').put('P', 5);
m.get('P').put('Y', 5);
m.get('Y').put('S', 3);
m.get('S').put('Y', 3);
m.get('Y').put('T', 3);
m.get('T').put('Y', 3);
m.get('Y').put('W', 0);
m.get('W').put('Y', 0);
m.get('Y').put('Y', -10);
m.get('V').put('A', 0);
m.get('A').put('V', 0);
m.get('V').put('R', 2);
m.get('R').put('V', 2);
m.get('V').put('N', 2);
m.get('N').put('V', 2);
m.get('V').put('D', 2);
m.get('D').put('V', 2);
m.get('V').put('C', 2);
m.get('C').put('V', 2);
m.get('V').put('Q', 2);
m.get('Q').put('V', 2);
m.get('V').put('E', 2);
m.get('E').put('V', 2);
m.get('V').put('G', 1);
m.get('G').put('V', 1);
m.get('V').put('H', 2);
m.get('H').put('V', 2);
m.get('V').put('I', -4);
m.get('I').put('V', -4);
m.get('V').put('L', -2);
m.get('L').put('V', -2);
m.get('V').put('K', 2);
m.get('K').put('V', 2);
m.get('V').put('M', -2);
m.get('M').put('V', -2);
m.get('V').put('F', 1);
m.get('F').put('V', 1);
m.get('V').put('P', 1);
m.get('P').put('V', 1);
m.get('V').put('S', 1);
m.get('S').put('V', 1);
m.get('V').put('T', 0);
m.get('T').put('V', 0);
m.get('V').put('W', 6);
m.get('W').put('V', 6);
m.get('V').put('Y', 2);
m.get('Y').put('V', 2);
m.get('V').put('V', -4);
}
@Override
public Integer getCost(Character aminoAcidChar1, Character aminoAcidChar2) {
try {
return m.get(aminoAcidChar1).get(aminoAcidChar2);
} catch (NullPointerException ex) {
throw new IllegalArgumentException("Bad arguments: (" +
aminoAcidChar1 + ", " + aminoAcidChar2 + ")",
ex);
}
}
}
AminoAcidAlphabet.java
package net.coderodde.bio.msa;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
public final class AminoAcidAlphabet {
public static final Character GAP_CHARACTER = '-';
private static AminoAcidAlphabet instance;
private final Set<Character> alphabet = new HashSet<>();
public static AminoAcidAlphabet getAminoAcidAlphabet() {
if (instance == null) {
instance = new AminoAcidAlphabet();
}
return instance;
}
private AminoAcidAlphabet() {
String aminoAcids = "ACDEF" +
"GHIKL" +
"MNPQR" +
"STVWY";
for (char c : aminoAcids.toCharArray()) {
alphabet.add(c);
}
}
public Set<Character> getCharacterSet() {
return Collections.<Character>unmodifiableSet(alphabet);
}
}
Alignment.java
package net.coderodde.bio.msa;
public class Alignment {
private final String alignment;
private final int cost;
Alignment(String alignment, int cost) {
this.alignment = alignment;
this.cost = cost;
}
public String getAlignemnt() {
return alignment;
}
public int getCost() {
return cost;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
String separator = "";
for (String row : alignment) {
sb.append(separator).append(row);
separator = "n";
}
sb.append("nCost: ").append(cost);
return sb.toString();
}
}
CostMatrix.java
package net.coderodde.bio.msa;
public interface CostMatrix<C> {
public C getCost(Character aminoAcidChar1, Character aminoAcidChar2);
}
HeuristicFunction.java
package net.coderodde.bio.msa;
import java.awt.Point;
import java.util.HashMap;
import java.util.Map;
final class HeuristicFunction {
private final Map<Point, Integer> matrix;
HeuristicFunction(MultipleSequenceAlignmentInstance instance) {
int sequences = instance.getSequenceArray().length;
this.matrix = new Map[sequences];
for (int i = 0; i < sequences; ++i) {
this.matrix[i] = new Map[sequences];
for (int j = i + 1; j < sequences; ++j) {
this.matrix[i][j] = new HashMap<>();
}
}
}
void put(int dimension1, int dimension2, Point point, Integer cost) {
matrix[dimension1][dimension2].put(point, cost);
}
boolean containsPartial(int dimension1, int dimension2, Point point) {
return matrix[dimension1][dimension2].containsKey(point);
}
Integer getPartial(int dimension1, int dimension2, Point point) {
return matrix[dimension1][dimension2].get(point);
}
Integer get(LatticeNode node) {
int cost = 0;
int coordinates = node.getCoordinates();
Point point = new Point();
for (int dimension1 = 0;
dimension1 < coordinates.length;
dimension1++) {
point.x = dimension1;
for (int dimension2 = dimension1 + 1;
dimension2 < coordinates.length;
dimension2++) {
point.y = dimension2;
cost += matrix[dimension1][dimension2].get(point);
}
}
return cost;
}
}
App.java
package net.coderodde.bio.msa;
final class App {
private static final String SEQUENCES = {
"ACGHKGMNPFQEKKFKLMNRW",
"CFGPQWYRTLMEKKFKNR",
"EACLMNRPQWTR",
"TIMWAYHTMGIEKKFK"
};
public static void main(String args) {
MultipleSequenceAlignmentInstance instance =
new MultipleSequenceAlignmentInstance(
PAM250CostMatrix.getPAM250CostMatrix(),
4,
SEQUENCES);
long start = System.currentTimeMillis();
Alignment alignment1 = instance.align();
long end = System.currentTimeMillis();
System.out.println(alignment1);
System.out.println(end - start + " ms.");
System.out.println();
start = System.currentTimeMillis();
Alignment alignment2 = instance.alignBrute();
end = System.currentTimeMillis();
System.out.println(alignment2);
System.out.println(end - start + " ms.");
}
}
Output
The output for the demo is
Computed heuristic function in 1003 milliseconds.
-ACG-HKGMNPF-QEKKFKLMNRW
-CFGPQW-YRTL-MEKKFK--NR-
EAC--LM-NRP--Q-WT-R-----
-TI--MWAYHTMGIEKKFK-----
Cost: 53
1126 ms.
-ACG-HKGMNPF-QEKKFKLMNRW
-CFGPQW-YRTL-MEKKFK--NR-
EAC--LM-NRP--Q-WT-R-----
-TI--MWAYHTMGIEKKFK-----
Cost: 53
60 ms.
Critique request
As always, please tell me anything that comes to mind.
java algorithm bioinformatics
add a comment |
(The entire project lives here.)
Problem definition
Suppose we are given three genomic strings drawn from the alphabet of 20 amino acids:
ACGH
CFG
EAC
In multiple sequence alignment problem we want to align them optimally in order to access their evolutionary similarity. Such an alignment may look like the following:
-AC-GH
--CFG-
EAC---
Solution
Often, the problem is cast as a pathfinding problem in $k$-dimensional lattice, where $k$ is the number of strings. Here is my program, it compares A* with Dijkstra's algorithms:
MultipleSequenceAlignmentInstance.java
package net.coderodde.bio.msa;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Set;
public final class MultipleSequenceAlignmentInstance {
/**
* The penalty for pairs in which there is one valid character and one gap.
*/
private final int gapPenalty;
/**
* The character pairs cost matrix.
*/
private final CostMatrix<Integer> costMatrix;
/**
* The array of sequences to be aligned.
*/
private final String sequenceArray;
// A small speed optimization:
private final char column;
public MultipleSequenceAlignmentInstance(CostMatrix<Integer> costMatrix,
int gapPenalty,
String... sequenceArray) {
this.costMatrix = Objects.requireNonNull(costMatrix,
"Cost matrix is null");
this.gapPenalty = gapPenalty;
this.sequenceArray = sequenceArray.clone();
this.column = new char[sequenceArray.length];
for (int i = 0; i != sequenceArray.length; ++i) {
this.sequenceArray[i] =
checkIsValidGenomicSequence(sequenceArray[i]);
}
}
public Alignment align() {
long start = System.currentTimeMillis();
HeuristicFunction hf = new HeuristicFunctionComputer()
.computeHeuristicFunction(this);
long end = System.currentTimeMillis();
System.out.println("Computed heuristic function in " + (end - start) +
" milliseconds.");
Queue<LatticeNodeHolder> open = new PriorityQueue<>();
Map<LatticeNode, Integer> distance = new HashMap<>();
Map<LatticeNode, LatticeNode> parents = new HashMap<>();
LatticeNode sourceNode = getSourceNode();
LatticeNode targetNode = getTargetNode();
open.add(new LatticeNodeHolder(sourceNode, 0));
distance.put(sourceNode, 0);
parents.put(sourceNode, null);
Set<LatticeNode> closed = new HashSet<>();
while (true) {
LatticeNode currentNode = open.remove().getNode();
if (currentNode.equals(targetNode)) {
return tracebackPath(parents, distance.get(currentNode));
}
if (closed.contains(currentNode)) {
continue;
}
closed.add(currentNode);
for (LatticeNode childNode : currentNode.getChildren()) {
if (closed.contains(childNode)) {
continue;
}
int tentativeCost = distance.get(currentNode) +
getWeight(currentNode, childNode);
Integer currentCost = distance.get(childNode);
if (currentCost == null || currentCost > tentativeCost) {
open.add(new LatticeNodeHolder(childNode,
tentativeCost +
hf.get(childNode)));
distance.put(childNode, tentativeCost);
parents.put(childNode, currentNode);
}
}
}
}
public Alignment alignBrute() {
Queue<LatticeNodeHolder> open = new PriorityQueue<>();
Map<LatticeNode, Integer> distance = new HashMap<>();
Map<LatticeNode, LatticeNode> parents = new HashMap<>();
LatticeNode sourceNode = getSourceNode();
LatticeNode targetNode = getTargetNode();
open.add(new LatticeNodeHolder(sourceNode, 0));
distance.put(sourceNode, 0);
parents.put(sourceNode, null);
Set<LatticeNode> closed = new HashSet<>();
while (true) {
LatticeNode currentNode = open.remove().getNode();
if (currentNode.equals(targetNode)) {
return tracebackPath(parents, distance.get(currentNode));
}
if (closed.contains(currentNode)) {
continue;
}
closed.add(currentNode);
for (LatticeNode childNode : currentNode.getChildren()) {
if (closed.contains(childNode)) {
continue;
}
int tentativeCost = distance.get(currentNode) +
getWeight(currentNode, childNode);
Integer currentCost = distance.get(childNode);
if (currentCost == null || currentCost > tentativeCost) {
open.add(new LatticeNodeHolder(childNode, tentativeCost));
distance.put(childNode, tentativeCost);
parents.put(childNode, currentNode);
}
}
}
}
/**
* Create the source node for the sequence alignment task, i.e., a lattice
* node with all coordinates set to zero.
*
* @return the source node.
*/
LatticeNode getSourceNode() {
int sourceCoordinates = new int[sequenceArray.length];
return new LatticeNode(this, sourceCoordinates);
}
/**
* Create the target node for the sequence alignment task, i.e., a lattice
* node with all coordinates set to the respective sequence lengths.
*
* @return the target node.
*/
LatticeNode getTargetNode() {
int targetCoordinates = new int[sequenceArray.length];
for (int i = 0; i != sequenceArray.length; ++i) {
targetCoordinates[i] = sequenceArray[i].length();
}
return new LatticeNode(this, targetCoordinates);
}
Integer getWeight(LatticeNode tail,
LatticeNode head,
int dimension1,
int dimension2) {
int tailCoordinates = tail.getCoordinates();
int headCoordinates = head.getCoordinates();
if (tailCoordinates[dimension1] == headCoordinates[dimension1]) {
//System.out.println("1");
return gapPenalty;
} else if (tailCoordinates[dimension2] == headCoordinates[dimension2]) {
//System.out.println("2");
return gapPenalty;
} else {
// System.out.println("3");
char character1 = sequenceArray[dimension1]
.charAt(tailCoordinates[dimension1]);
char character2 = sequenceArray[dimension2]
.charAt(tailCoordinates[dimension2]);
return costMatrix.getCost(character1, character2);
}
}
Integer getWeight(LatticeNode tail, LatticeNode head) {
// Extract the column represented by taking a single hop from 'tail' to
// 'head':
int tailCoordinates = tail.getCoordinates();
int headCoordinates = head.getCoordinates();
for (int i = 0; i < sequenceArray.length; ++i) {
if (tailCoordinates[i] + 1 == headCoordinates[i]) {
column[i] = sequenceArray[i].charAt(tailCoordinates[i]);
} else {
column[i] = AminoAcidAlphabet.GAP_CHARACTER;
}
}
// Compute the hop cost as the sum of pairwise hops in any plane:
int cost = 0;
for (int i = 0; i < column.length; ++i) {
char character1 = column[i];
for (int j = i + 1; j < column.length; ++j) {
char character2 = column[j];
if (character1 != AminoAcidAlphabet.GAP_CHARACTER) {
if (character2 != AminoAcidAlphabet.GAP_CHARACTER) {
cost += costMatrix.getCost(character1, character2);
} else {
cost += gapPenalty;
}
} else {
// character1 IS the gap character:
if (character2 != AminoAcidAlphabet.GAP_CHARACTER) {
cost += gapPenalty;
} else {
// Do nothing since we have a pair (gap, gap).
}
}
}
}
return cost;
}
String getSequenceArray() {
return sequenceArray;
}
private Alignment tracebackPath(Map<LatticeNode, LatticeNode> parents,
Integer cost) {
List<LatticeNode> path = new ArrayList<>();
LatticeNode node = getTargetNode();
while (node != null) {
path.add(node);
node = parents.get(node);
}
Collections.<LatticeNode>reverse(path);
String strings = new String[getSequenceArray().length];
StringBuilder stringBuilders = new StringBuilder[strings.length];
for (int i = 0; i < stringBuilders.length; ++i) {
stringBuilders[i] = new StringBuilder();
}
for (int i = 1; i < path.size(); ++i) {
LatticeNode tail = path.get(i - 1);
LatticeNode head = path.get(i);
int tailCoordinates = tail.getCoordinates();
int headCoordinates = head.getCoordinates();
for (int j = 0; j < tailCoordinates.length; ++j) {
if (tailCoordinates[j] != headCoordinates[j]) {
stringBuilders[j].
append(sequenceArray[j].charAt(tailCoordinates[j]));
} else {
stringBuilders[j].append(AminoAcidAlphabet.GAP_CHARACTER);
}
}
}
for (int i = 0; i < strings.length; ++i) {
strings[i] = stringBuilders[i].toString();
}
return new Alignment(strings, cost);
}
private String checkIsValidGenomicSequence(String string) {
Set<Character> characterSet =
AminoAcidAlphabet.getAminoAcidAlphabet()
.getCharacterSet();
for (char c : string.toCharArray()) {
if (!characterSet.contains(c)) {
throw new IllegalArgumentException("Unknown amino acid: " + c);
}
}
return string;
}
private static final class LatticeNodeHolder
implements Comparable<LatticeNodeHolder> {
private final Integer fScore;
private final LatticeNode node;
LatticeNodeHolder(LatticeNode node, Integer fScore) {
this.fScore = fScore;
this.node = node;
}
LatticeNode getNode() {
return node;
}
@Override
public int compareTo(LatticeNodeHolder o) {
return Integer.compare(fScore, o.fScore);
}
}
}
HeuristicFunctionComputer.java
package net.coderodde.bio.msa;
import java.awt.Point;
import java.util.HashSet;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Set;
final class HeuristicFunctionComputer {
HeuristicFunction computeHeuristicFunction(
MultipleSequenceAlignmentInstance instance) {
String sequenceArray = instance.getSequenceArray();
HeuristicFunction heuristicFunction = new HeuristicFunction(instance);
for (int dimension1 = 0;
dimension1 < sequenceArray.length;
dimension1++) {
for (int dimension2 = dimension1 + 1;
dimension2 < sequenceArray.length;
dimension2++) {
loadPartialHeuristicFunction(heuristicFunction,
dimension1,
dimension2,
instance);
}
}
return heuristicFunction;
}
// Basically, this method runs Dijkstra backwards from the target node until
// the entire 2D-grid is explored.
private void loadPartialHeuristicFunction(
HeuristicFunction heuristicFunction,
int dimension1,
int dimension2,
MultipleSequenceAlignmentInstance instance) {
LatticeNode target = instance.getTargetNode();
Queue<LatticeNodeHolder> open = new PriorityQueue<>();
Set<LatticeNode> closed = new HashSet<>();
open.add(new LatticeNodeHolder(target, 0));
Point point = new Point();
while (!open.isEmpty()) {
LatticeNodeHolder currentNodeHolder = open.remove();
LatticeNode currentNode = currentNodeHolder.getLatticeNode();
if (closed.contains(currentNode)) {
continue;
}
closed.add(currentNode);
Integer currentNodeCost = currentNodeHolder.getCost();
extractPoint(point, currentNode, dimension1, dimension2);
heuristicFunction.put(dimension1,
dimension2,
new Point(point),
currentNodeCost);
for (LatticeNode parent : currentNode.getParents()) {
if (closed.contains(parent)) {
continue;
}
int tentativeCost =
currentNodeCost + instance.getWeight(parent,
currentNode,
dimension1,
dimension2);
extractPoint(point, parent, dimension1, dimension2);
if (!heuristicFunction.containsPartial(dimension1,
dimension2,
point)
|| heuristicFunction.getPartial(dimension1,
dimension2,
point)
> tentativeCost) {
open.add(new LatticeNodeHolder(parent, tentativeCost));
}
}
}
}
private static void extractPoint(Point point,
LatticeNode latticeNode,
int i,
int j) {
int coordinates = latticeNode.getCoordinates();
point.x = coordinates[i];
point.y = coordinates[j];
}
private static final class LatticeNodeHolder
implements Comparable<LatticeNodeHolder> {
private final LatticeNode node;
private final Integer cost;
LatticeNodeHolder(LatticeNode node, Integer cost) {
this.node = node;
this.cost = cost;
}
LatticeNode getLatticeNode() {
return node;
}
Integer getCost() {
return cost;
}
@Override
public int compareTo(LatticeNodeHolder o) {
return Integer.compare(cost, o.cost);
}
}
}
LatticeNode.java
package net.coderodde.bio.msa;
import com.sun.corba.se.spi.orb.OperationFactory;
import java.util.Arrays;
final class LatticeNode {
/**
* The problem instance this lattice node contributes to.
*/
private final MultipleSequenceAlignmentInstance instance;
/**
* The coordinates of this node in the entire lattice.
*/
private final int coordinates;
LatticeNode(MultipleSequenceAlignmentInstance instance, int coordinates) {
this.instance = instance;
this.coordinates = coordinates;
}
@Override
public boolean equals(Object o) {
return Arrays.equals(coordinates, ((LatticeNode) o).coordinates);
}
// Generated by NetBeans 8.1:
@Override
public int hashCode() {
int hash = 7;
hash = 41 * hash + Arrays.hashCode(this.coordinates);
return hash;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder("[");
String separator = "";
for (int coordinate : coordinates) {
sb.append(separator).append(coordinate);
separator = ", ";
}
return sb.append("]").toString();
}
LatticeNode getChildren() {
// Find out in how many dimension we can move forward:
int dimensionsNotReached = 0;
String sequenceArray = instance.getSequenceArray();
boolean inclusionMap = new boolean[coordinates.length];
for (int i = 0; i < sequenceArray.length; ++i) {
if (coordinates[i] < sequenceArray[i].length()) {
dimensionsNotReached++;
// We can make a step forward in the direction of ith dimension:
inclusionMap[i] = true;
}
}
// Create the array of children:
int numberOfChildren = pow2(dimensionsNotReached) - 1;
LatticeNode children = new LatticeNode[numberOfChildren];
loadChildren(children, inclusionMap);
// Convert offsets to actual child coordinates:
for (LatticeNode child : children) {
child.addOffsets(coordinates);
}
return children;
}
LatticeNode getParents() {
// Find out in how many dimensions we can move BACKward:
int dimensionsNotReached = 0;
String sequenceArray = instance.getSequenceArray();
boolean inclusionMap = new boolean[coordinates.length];
for (int i = 0; i < sequenceArray.length; ++i) {
if (coordinates[i] > 0) {
dimensionsNotReached++;
// We can make a step backwards in the direction of ith
// dimension:
inclusionMap[i] = true;
}
}
// Create the array of parents:
int numberOfParents = pow2(dimensionsNotReached) - 1;
LatticeNode parents = new LatticeNode[numberOfParents];
loadParents(parents, inclusionMap);
// Convert the offsets to actual parent coordinates:
for (LatticeNode parent : parents) {
parent.addOffsets(coordinates);
}
return parents;
}
int getCoordinates() {
return coordinates;
}
private void loadChildren(LatticeNode children, boolean inclusionMap) {
int coords = new int[this.coordinates.length];
for (int i = 0; i != children.length; ++i) {
increment(coords, inclusionMap);
children[i] = new LatticeNode(instance, coords.clone());
}
}
private void loadParents(LatticeNode parents, boolean inclusionMap) {
int coords = new int[this.coordinates.length];
for (int i = 0; i != parents.length; ++i) {
decrement(coords, inclusionMap);
parents[i] = new LatticeNode(instance, coords.clone());
}
}
private static void increment(int coordinates, boolean inclusionMap) {
for (int i = 0; i < coordinates.length; ++i) {
if (!inclusionMap[i]) {
continue;
}
if (coordinates[i] == 0) {
coordinates[i] = 1;
return;
}
coordinates[i] = 0;
}
}
private static void decrement(int coordinates, boolean inclusionMap) {
for (int i = 0; i < coordinates.length; ++i) {
if (!inclusionMap[i]) {
continue;
}
if (coordinates[i] == 0) {
coordinates[i] = -1;
return;
}
coordinates[i] = 0;
}
}
private void addOffsets(int offsets) {
for (int i = 0; i < coordinates.length; ++i) {
coordinates[i] += offsets[i];
}
}
private static int pow2(int exponent) {
int ret = 1;
for (int e = 0; e < exponent; ++e) {
ret *= 2;
}
return ret;
}
}
PAM250CostMatrix.java
package net.coderodde.bio.msa;
import java.util.HashMap;
import java.util.Map;
import net.coderodde.bio.msa.AminoAcidAlphabet;
import net.coderodde.bio.msa.CostMatrix;
public final class PAM250CostMatrix implements CostMatrix<Integer> {
private static PAM250CostMatrix instance;
private final Map<Character, Map<Character, Integer>> m = new HashMap<>();
public static PAM250CostMatrix getPAM250CostMatrix() {
if (instance == null) {
instance = new PAM250CostMatrix();
}
return instance;
}
private PAM250CostMatrix() {
AminoAcidAlphabet alphabet = AminoAcidAlphabet.getAminoAcidAlphabet();
alphabet.getCharacterSet().stream().forEach((character) -> {
m.put(character, new HashMap<>());
});
m.get('A').put('A', -2);
m.get('R').put('A', 2);
m.get('A').put('R', 2);
m.get('R').put('R', -6);
m.get('N').put('A', 0);
m.get('A').put('N', 0);
m.get('N').put('R', 0);
m.get('R').put('N', 0);
m.get('N').put('N', -2);
m.get('D').put('A', 0);
m.get('A').put('D', 0);
m.get('D').put('R', 1);
m.get('R').put('D', 1);
m.get('D').put('N', -2);
m.get('N').put('D', -2);
m.get('D').put('D', -4);
m.get('C').put('A', 2);
m.get('A').put('C', 2);
m.get('C').put('R', 4);
m.get('R').put('C', 4);
m.get('C').put('N', 4);
m.get('N').put('C', 4);
m.get('C').put('D', 5);
m.get('D').put('C', 5);
m.get('C').put('C', -4);
m.get('Q').put('A', 0);
m.get('A').put('Q', 0);
m.get('Q').put('R', -1);
m.get('R').put('Q', -1);
m.get('Q').put('N', -1);
m.get('N').put('Q', -1);
m.get('Q').put('D', -2);
m.get('D').put('Q', -2);
m.get('Q').put('C', 5);
m.get('C').put('Q', 5);
m.get('Q').put('Q', -4);
m.get('E').put('A', 0);
m.get('A').put('E', 0);
m.get('E').put('R', 1);
m.get('R').put('E', 1);
m.get('E').put('N', -1);
m.get('N').put('E', -1);
m.get('E').put('D', -3);
m.get('D').put('E', -3);
m.get('E').put('C', 5);
m.get('C').put('E', 5);
m.get('E').put('Q', -2);
m.get('Q').put('E', -2);
m.get('E').put('E', -4);
m.get('G').put('A', -1);
m.get('A').put('G', -1);
m.get('G').put('R', 3);
m.get('R').put('G', 3);
m.get('G').put('N', 0);
m.get('N').put('G', 0);
m.get('G').put('D', -1);
m.get('D').put('G', -1);
m.get('G').put('C', 3);
m.get('C').put('G', 3);
m.get('G').put('Q', 1);
m.get('Q').put('G', 1);
m.get('G').put('E', 0);
m.get('E').put('G', 0);
m.get('G').put('G', -5);
m.get('H').put('A', 1);
m.get('A').put('H', 1);
m.get('H').put('R', -2);
m.get('R').put('H', -2);
m.get('H').put('N', -2);
m.get('N').put('H', -2);
m.get('H').put('D', -1);
m.get('D').put('H', -1);
m.get('H').put('C', 3);
m.get('C').put('H', 3);
m.get('H').put('Q', -3);
m.get('Q').put('H', -3);
m.get('H').put('E', -1);
m.get('E').put('H', -1);
m.get('H').put('G', 2);
m.get('G').put('H', 2);
m.get('H').put('H', -6);
m.get('I').put('A', 1);
m.get('A').put('I', 1);
m.get('I').put('R', 2);
m.get('R').put('I', 2);
m.get('I').put('N', 2);
m.get('N').put('I', 2);
m.get('I').put('D', 2);
m.get('D').put('I', 2);
m.get('I').put('C', 2);
m.get('C').put('I', 2);
m.get('I').put('Q', 2);
m.get('Q').put('I', 2);
m.get('I').put('E', 2);
m.get('E').put('I', 2);
m.get('I').put('G', 3);
m.get('G').put('I', 3);
m.get('I').put('H', 2);
m.get('H').put('I', 2);
m.get('I').put('I', -5);
m.get('L').put('A', 2);
m.get('A').put('L', 2);
m.get('L').put('R', 3);
m.get('R').put('L', 3);
m.get('L').put('N', 3);
m.get('N').put('L', 3);
m.get('L').put('D', 4);
m.get('D').put('L', 4);
m.get('L').put('C', 6);
m.get('C').put('L', 6);
m.get('L').put('Q', 2);
m.get('Q').put('L', 2);
m.get('L').put('E', 3);
m.get('E').put('L', 3);
m.get('L').put('G', 4);
m.get('G').put('L', 4);
m.get('L').put('H', 2);
m.get('H').put('L', 2);
m.get('L').put('I', -2);
m.get('I').put('L', -2);
m.get('L').put('L', -6);
m.get('K').put('A', 1);
m.get('A').put('K', 1);
m.get('K').put('R', -3);
m.get('R').put('K', -3);
m.get('K').put('N', -1);
m.get('N').put('K', -1);
m.get('K').put('D', 0);
m.get('D').put('K', 0);
m.get('K').put('C', 5);
m.get('C').put('K', 5);
m.get('K').put('Q', -1);
m.get('Q').put('K', -1);
m.get('K').put('E', 0);
m.get('E').put('K', 0);
m.get('K').put('G', 2);
m.get('G').put('K', 2);
m.get('K').put('H', 0);
m.get('H').put('K', 0);
m.get('K').put('I', 2);
m.get('I').put('K', 2);
m.get('K').put('L', 3);
m.get('L').put('K', 3);
m.get('K').put('K', -5);
m.get('M').put('A', 1);
m.get('A').put('M', 1);
m.get('M').put('R', 0);
m.get('R').put('M', 0);
m.get('M').put('N', 2);
m.get('N').put('M', 2);
m.get('M').put('D', 3);
m.get('D').put('M', 3);
m.get('M').put('C', 5);
m.get('C').put('M', 5);
m.get('M').put('Q', 1);
m.get('Q').put('M', 1);
m.get('M').put('E', 2);
m.get('E').put('M', 2);
m.get('M').put('G', 3);
m.get('G').put('M', 3);
m.get('M').put('H', 2);
m.get('H').put('M', 2);
m.get('M').put('I', -2);
m.get('I').put('M', -2);
m.get('M').put('L', -4);
m.get('L').put('M', -4);
m.get('M').put('K', 0);
m.get('K').put('M', 0);
m.get('M').put('M', -6);
m.get('F').put('A', 4);
m.get('A').put('F', 4);
m.get('F').put('R', 4);
m.get('R').put('F', 4);
m.get('F').put('N', 4);
m.get('N').put('F', 4);
m.get('F').put('D', 6);
m.get('D').put('F', 6);
m.get('F').put('C', 4);
m.get('C').put('F', 4);
m.get('F').put('Q', 5);
m.get('Q').put('F', 5);
m.get('F').put('E', 5);
m.get('E').put('F', 5);
m.get('F').put('G', 5);
m.get('G').put('F', 5);
m.get('F').put('H', 2);
m.get('H').put('F', 2);
m.get('F').put('I', -1);
m.get('I').put('F', -1);
m.get('F').put('L', -2);
m.get('L').put('F', -2);
m.get('F').put('K', 5);
m.get('K').put('F', 5);
m.get('F').put('M', 0);
m.get('M').put('F', 0);
m.get('F').put('F', -9);
m.get('P').put('A', -1);
m.get('A').put('P', -1);
m.get('P').put('R', 0);
m.get('R').put('P', 0);
m.get('P').put('N', 1);
m.get('N').put('P', 1);
m.get('P').put('D', 1);
m.get('D').put('P', 1);
m.get('P').put('C', 3);
m.get('C').put('P', 3);
m.get('P').put('Q', 0);
m.get('Q').put('P', 0);
m.get('P').put('E', 1);
m.get('E').put('P', 1);
m.get('P').put('G', 1);
m.get('G').put('P', 1);
m.get('P').put('H', 0);
m.get('H').put('P', 0);
m.get('P').put('I', 2);
m.get('I').put('P', 2);
m.get('P').put('L', 3);
m.get('L').put('P', 3);
m.get('P').put('K', 1);
m.get('K').put('P', 1);
m.get('P').put('M', 2);
m.get('M').put('P', 2);
m.get('P').put('F', 5);
m.get('F').put('P', 5);
m.get('P').put('P', -6);
m.get('S').put('A', -1);
m.get('A').put('S', -1);
m.get('S').put('R', 0);
m.get('R').put('S', 0);
m.get('S').put('N', -1);
m.get('N').put('S', -1);
m.get('S').put('D', 0);
m.get('D').put('S', 0);
m.get('S').put('C', 0);
m.get('C').put('S', 0);
m.get('S').put('Q', 1);
m.get('Q').put('S', 1);
m.get('S').put('E', 0);
m.get('E').put('S', 0);
m.get('S').put('G', -1);
m.get('G').put('S', -1);
m.get('S').put('H', 1);
m.get('H').put('S', 1);
m.get('S').put('I', 1);
m.get('I').put('S', 1);
m.get('S').put('L', 3);
m.get('L').put('S', 3);
m.get('S').put('K', 0);
m.get('K').put('S', 0);
m.get('S').put('M', 2);
m.get('M').put('S', 2);
m.get('S').put('F', 3);
m.get('F').put('S', 3);
m.get('S').put('P', -1);
m.get('P').put('S', -1);
m.get('S').put('S', -3);
m.get('T').put('A', -1);
m.get('A').put('T', -1);
m.get('T').put('R', 1);
m.get('R').put('T', 1);
m.get('T').put('N', 0);
m.get('N').put('T', 0);
m.get('T').put('D', 0);
m.get('D').put('T', 0);
m.get('T').put('C', 2);
m.get('C').put('T', 2);
m.get('T').put('Q', 1);
m.get('Q').put('T', 1);
m.get('T').put('E', 0);
m.get('E').put('T', 0);
m.get('T').put('G', 0);
m.get('G').put('T', 0);
m.get('T').put('H', 1);
m.get('H').put('T', 1);
m.get('T').put('I', 0);
m.get('I').put('T', 0);
m.get('T').put('L', 2);
m.get('L').put('T', 2);
m.get('T').put('K', 0);
m.get('K').put('T', 0);
m.get('T').put('M', 1);
m.get('M').put('T', 1);
m.get('T').put('F', 2);
m.get('F').put('T', 2);
m.get('T').put('P', 0);
m.get('P').put('T', 0);
m.get('T').put('S', -1);
m.get('S').put('T', -1);
m.get('T').put('T', -3);
m.get('W').put('A', 6);
m.get('A').put('W', 6);
m.get('W').put('R', -2);
m.get('R').put('W', -2);
m.get('W').put('N', 4);
m.get('N').put('W', 4);
m.get('W').put('D', 7);
m.get('D').put('W', 7);
m.get('W').put('C', 8);
m.get('C').put('W', 8);
m.get('W').put('Q', 5);
m.get('Q').put('W', 5);
m.get('W').put('E', 7);
m.get('E').put('W', 7);
m.get('W').put('G', 7);
m.get('G').put('W', 7);
m.get('W').put('H', 3);
m.get('H').put('W', 3);
m.get('W').put('I', 5);
m.get('I').put('W', 5);
m.get('W').put('L', 2);
m.get('L').put('W', 2);
m.get('W').put('K', 3);
m.get('K').put('W', 3);
m.get('W').put('M', 4);
m.get('M').put('W', 4);
m.get('W').put('F', 0);
m.get('F').put('W', 0);
m.get('W').put('P', 6);
m.get('P').put('W', 6);
m.get('W').put('S', 2);
m.get('S').put('W', 2);
m.get('W').put('T', 5);
m.get('T').put('W', 5);
m.get('W').put('W', -17);
m.get('Y').put('A', 3);
m.get('A').put('Y', 3);
m.get('Y').put('R', 4);
m.get('R').put('Y', 4);
m.get('Y').put('N', 2);
m.get('N').put('Y', 2);
m.get('Y').put('D', 4);
m.get('D').put('Y', 4);
m.get('Y').put('C', 0);
m.get('C').put('Y', 0);
m.get('Y').put('Q', 4);
m.get('Q').put('Y', 4);
m.get('Y').put('E', 4);
m.get('E').put('Y', 4);
m.get('Y').put('G', 5);
m.get('G').put('Y', 5);
m.get('Y').put('H', 0);
m.get('H').put('Y', 0);
m.get('Y').put('I', 1);
m.get('I').put('Y', 1);
m.get('Y').put('L', 1);
m.get('L').put('Y', 1);
m.get('Y').put('K', 4);
m.get('K').put('Y', 4);
m.get('Y').put('M', 2);
m.get('M').put('Y', 2);
m.get('Y').put('F', -7);
m.get('F').put('Y', -7);
m.get('Y').put('P', 5);
m.get('P').put('Y', 5);
m.get('Y').put('S', 3);
m.get('S').put('Y', 3);
m.get('Y').put('T', 3);
m.get('T').put('Y', 3);
m.get('Y').put('W', 0);
m.get('W').put('Y', 0);
m.get('Y').put('Y', -10);
m.get('V').put('A', 0);
m.get('A').put('V', 0);
m.get('V').put('R', 2);
m.get('R').put('V', 2);
m.get('V').put('N', 2);
m.get('N').put('V', 2);
m.get('V').put('D', 2);
m.get('D').put('V', 2);
m.get('V').put('C', 2);
m.get('C').put('V', 2);
m.get('V').put('Q', 2);
m.get('Q').put('V', 2);
m.get('V').put('E', 2);
m.get('E').put('V', 2);
m.get('V').put('G', 1);
m.get('G').put('V', 1);
m.get('V').put('H', 2);
m.get('H').put('V', 2);
m.get('V').put('I', -4);
m.get('I').put('V', -4);
m.get('V').put('L', -2);
m.get('L').put('V', -2);
m.get('V').put('K', 2);
m.get('K').put('V', 2);
m.get('V').put('M', -2);
m.get('M').put('V', -2);
m.get('V').put('F', 1);
m.get('F').put('V', 1);
m.get('V').put('P', 1);
m.get('P').put('V', 1);
m.get('V').put('S', 1);
m.get('S').put('V', 1);
m.get('V').put('T', 0);
m.get('T').put('V', 0);
m.get('V').put('W', 6);
m.get('W').put('V', 6);
m.get('V').put('Y', 2);
m.get('Y').put('V', 2);
m.get('V').put('V', -4);
}
@Override
public Integer getCost(Character aminoAcidChar1, Character aminoAcidChar2) {
try {
return m.get(aminoAcidChar1).get(aminoAcidChar2);
} catch (NullPointerException ex) {
throw new IllegalArgumentException("Bad arguments: (" +
aminoAcidChar1 + ", " + aminoAcidChar2 + ")",
ex);
}
}
}
AminoAcidAlphabet.java
package net.coderodde.bio.msa;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
public final class AminoAcidAlphabet {
public static final Character GAP_CHARACTER = '-';
private static AminoAcidAlphabet instance;
private final Set<Character> alphabet = new HashSet<>();
public static AminoAcidAlphabet getAminoAcidAlphabet() {
if (instance == null) {
instance = new AminoAcidAlphabet();
}
return instance;
}
private AminoAcidAlphabet() {
String aminoAcids = "ACDEF" +
"GHIKL" +
"MNPQR" +
"STVWY";
for (char c : aminoAcids.toCharArray()) {
alphabet.add(c);
}
}
public Set<Character> getCharacterSet() {
return Collections.<Character>unmodifiableSet(alphabet);
}
}
Alignment.java
package net.coderodde.bio.msa;
public class Alignment {
private final String alignment;
private final int cost;
Alignment(String alignment, int cost) {
this.alignment = alignment;
this.cost = cost;
}
public String getAlignemnt() {
return alignment;
}
public int getCost() {
return cost;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
String separator = "";
for (String row : alignment) {
sb.append(separator).append(row);
separator = "n";
}
sb.append("nCost: ").append(cost);
return sb.toString();
}
}
CostMatrix.java
package net.coderodde.bio.msa;
public interface CostMatrix<C> {
public C getCost(Character aminoAcidChar1, Character aminoAcidChar2);
}
HeuristicFunction.java
package net.coderodde.bio.msa;
import java.awt.Point;
import java.util.HashMap;
import java.util.Map;
final class HeuristicFunction {
private final Map<Point, Integer> matrix;
HeuristicFunction(MultipleSequenceAlignmentInstance instance) {
int sequences = instance.getSequenceArray().length;
this.matrix = new Map[sequences];
for (int i = 0; i < sequences; ++i) {
this.matrix[i] = new Map[sequences];
for (int j = i + 1; j < sequences; ++j) {
this.matrix[i][j] = new HashMap<>();
}
}
}
void put(int dimension1, int dimension2, Point point, Integer cost) {
matrix[dimension1][dimension2].put(point, cost);
}
boolean containsPartial(int dimension1, int dimension2, Point point) {
return matrix[dimension1][dimension2].containsKey(point);
}
Integer getPartial(int dimension1, int dimension2, Point point) {
return matrix[dimension1][dimension2].get(point);
}
Integer get(LatticeNode node) {
int cost = 0;
int coordinates = node.getCoordinates();
Point point = new Point();
for (int dimension1 = 0;
dimension1 < coordinates.length;
dimension1++) {
point.x = dimension1;
for (int dimension2 = dimension1 + 1;
dimension2 < coordinates.length;
dimension2++) {
point.y = dimension2;
cost += matrix[dimension1][dimension2].get(point);
}
}
return cost;
}
}
App.java
package net.coderodde.bio.msa;
final class App {
private static final String SEQUENCES = {
"ACGHKGMNPFQEKKFKLMNRW",
"CFGPQWYRTLMEKKFKNR",
"EACLMNRPQWTR",
"TIMWAYHTMGIEKKFK"
};
public static void main(String args) {
MultipleSequenceAlignmentInstance instance =
new MultipleSequenceAlignmentInstance(
PAM250CostMatrix.getPAM250CostMatrix(),
4,
SEQUENCES);
long start = System.currentTimeMillis();
Alignment alignment1 = instance.align();
long end = System.currentTimeMillis();
System.out.println(alignment1);
System.out.println(end - start + " ms.");
System.out.println();
start = System.currentTimeMillis();
Alignment alignment2 = instance.alignBrute();
end = System.currentTimeMillis();
System.out.println(alignment2);
System.out.println(end - start + " ms.");
}
}
Output
The output for the demo is
Computed heuristic function in 1003 milliseconds.
-ACG-HKGMNPF-QEKKFKLMNRW
-CFGPQW-YRTL-MEKKFK--NR-
EAC--LM-NRP--Q-WT-R-----
-TI--MWAYHTMGIEKKFK-----
Cost: 53
1126 ms.
-ACG-HKGMNPF-QEKKFKLMNRW
-CFGPQW-YRTL-MEKKFK--NR-
EAC--LM-NRP--Q-WT-R-----
-TI--MWAYHTMGIEKKFK-----
Cost: 53
60 ms.
Critique request
As always, please tell me anything that comes to mind.
java algorithm bioinformatics
add a comment |
(The entire project lives here.)
Problem definition
Suppose we are given three genomic strings drawn from the alphabet of 20 amino acids:
ACGH
CFG
EAC
In multiple sequence alignment problem we want to align them optimally in order to access their evolutionary similarity. Such an alignment may look like the following:
-AC-GH
--CFG-
EAC---
Solution
Often, the problem is cast as a pathfinding problem in $k$-dimensional lattice, where $k$ is the number of strings. Here is my program, it compares A* with Dijkstra's algorithms:
MultipleSequenceAlignmentInstance.java
package net.coderodde.bio.msa;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Set;
public final class MultipleSequenceAlignmentInstance {
/**
* The penalty for pairs in which there is one valid character and one gap.
*/
private final int gapPenalty;
/**
* The character pairs cost matrix.
*/
private final CostMatrix<Integer> costMatrix;
/**
* The array of sequences to be aligned.
*/
private final String sequenceArray;
// A small speed optimization:
private final char column;
public MultipleSequenceAlignmentInstance(CostMatrix<Integer> costMatrix,
int gapPenalty,
String... sequenceArray) {
this.costMatrix = Objects.requireNonNull(costMatrix,
"Cost matrix is null");
this.gapPenalty = gapPenalty;
this.sequenceArray = sequenceArray.clone();
this.column = new char[sequenceArray.length];
for (int i = 0; i != sequenceArray.length; ++i) {
this.sequenceArray[i] =
checkIsValidGenomicSequence(sequenceArray[i]);
}
}
public Alignment align() {
long start = System.currentTimeMillis();
HeuristicFunction hf = new HeuristicFunctionComputer()
.computeHeuristicFunction(this);
long end = System.currentTimeMillis();
System.out.println("Computed heuristic function in " + (end - start) +
" milliseconds.");
Queue<LatticeNodeHolder> open = new PriorityQueue<>();
Map<LatticeNode, Integer> distance = new HashMap<>();
Map<LatticeNode, LatticeNode> parents = new HashMap<>();
LatticeNode sourceNode = getSourceNode();
LatticeNode targetNode = getTargetNode();
open.add(new LatticeNodeHolder(sourceNode, 0));
distance.put(sourceNode, 0);
parents.put(sourceNode, null);
Set<LatticeNode> closed = new HashSet<>();
while (true) {
LatticeNode currentNode = open.remove().getNode();
if (currentNode.equals(targetNode)) {
return tracebackPath(parents, distance.get(currentNode));
}
if (closed.contains(currentNode)) {
continue;
}
closed.add(currentNode);
for (LatticeNode childNode : currentNode.getChildren()) {
if (closed.contains(childNode)) {
continue;
}
int tentativeCost = distance.get(currentNode) +
getWeight(currentNode, childNode);
Integer currentCost = distance.get(childNode);
if (currentCost == null || currentCost > tentativeCost) {
open.add(new LatticeNodeHolder(childNode,
tentativeCost +
hf.get(childNode)));
distance.put(childNode, tentativeCost);
parents.put(childNode, currentNode);
}
}
}
}
public Alignment alignBrute() {
Queue<LatticeNodeHolder> open = new PriorityQueue<>();
Map<LatticeNode, Integer> distance = new HashMap<>();
Map<LatticeNode, LatticeNode> parents = new HashMap<>();
LatticeNode sourceNode = getSourceNode();
LatticeNode targetNode = getTargetNode();
open.add(new LatticeNodeHolder(sourceNode, 0));
distance.put(sourceNode, 0);
parents.put(sourceNode, null);
Set<LatticeNode> closed = new HashSet<>();
while (true) {
LatticeNode currentNode = open.remove().getNode();
if (currentNode.equals(targetNode)) {
return tracebackPath(parents, distance.get(currentNode));
}
if (closed.contains(currentNode)) {
continue;
}
closed.add(currentNode);
for (LatticeNode childNode : currentNode.getChildren()) {
if (closed.contains(childNode)) {
continue;
}
int tentativeCost = distance.get(currentNode) +
getWeight(currentNode, childNode);
Integer currentCost = distance.get(childNode);
if (currentCost == null || currentCost > tentativeCost) {
open.add(new LatticeNodeHolder(childNode, tentativeCost));
distance.put(childNode, tentativeCost);
parents.put(childNode, currentNode);
}
}
}
}
/**
* Create the source node for the sequence alignment task, i.e., a lattice
* node with all coordinates set to zero.
*
* @return the source node.
*/
LatticeNode getSourceNode() {
int sourceCoordinates = new int[sequenceArray.length];
return new LatticeNode(this, sourceCoordinates);
}
/**
* Create the target node for the sequence alignment task, i.e., a lattice
* node with all coordinates set to the respective sequence lengths.
*
* @return the target node.
*/
LatticeNode getTargetNode() {
int targetCoordinates = new int[sequenceArray.length];
for (int i = 0; i != sequenceArray.length; ++i) {
targetCoordinates[i] = sequenceArray[i].length();
}
return new LatticeNode(this, targetCoordinates);
}
Integer getWeight(LatticeNode tail,
LatticeNode head,
int dimension1,
int dimension2) {
int tailCoordinates = tail.getCoordinates();
int headCoordinates = head.getCoordinates();
if (tailCoordinates[dimension1] == headCoordinates[dimension1]) {
//System.out.println("1");
return gapPenalty;
} else if (tailCoordinates[dimension2] == headCoordinates[dimension2]) {
//System.out.println("2");
return gapPenalty;
} else {
// System.out.println("3");
char character1 = sequenceArray[dimension1]
.charAt(tailCoordinates[dimension1]);
char character2 = sequenceArray[dimension2]
.charAt(tailCoordinates[dimension2]);
return costMatrix.getCost(character1, character2);
}
}
Integer getWeight(LatticeNode tail, LatticeNode head) {
// Extract the column represented by taking a single hop from 'tail' to
// 'head':
int tailCoordinates = tail.getCoordinates();
int headCoordinates = head.getCoordinates();
for (int i = 0; i < sequenceArray.length; ++i) {
if (tailCoordinates[i] + 1 == headCoordinates[i]) {
column[i] = sequenceArray[i].charAt(tailCoordinates[i]);
} else {
column[i] = AminoAcidAlphabet.GAP_CHARACTER;
}
}
// Compute the hop cost as the sum of pairwise hops in any plane:
int cost = 0;
for (int i = 0; i < column.length; ++i) {
char character1 = column[i];
for (int j = i + 1; j < column.length; ++j) {
char character2 = column[j];
if (character1 != AminoAcidAlphabet.GAP_CHARACTER) {
if (character2 != AminoAcidAlphabet.GAP_CHARACTER) {
cost += costMatrix.getCost(character1, character2);
} else {
cost += gapPenalty;
}
} else {
// character1 IS the gap character:
if (character2 != AminoAcidAlphabet.GAP_CHARACTER) {
cost += gapPenalty;
} else {
// Do nothing since we have a pair (gap, gap).
}
}
}
}
return cost;
}
String getSequenceArray() {
return sequenceArray;
}
private Alignment tracebackPath(Map<LatticeNode, LatticeNode> parents,
Integer cost) {
List<LatticeNode> path = new ArrayList<>();
LatticeNode node = getTargetNode();
while (node != null) {
path.add(node);
node = parents.get(node);
}
Collections.<LatticeNode>reverse(path);
String strings = new String[getSequenceArray().length];
StringBuilder stringBuilders = new StringBuilder[strings.length];
for (int i = 0; i < stringBuilders.length; ++i) {
stringBuilders[i] = new StringBuilder();
}
for (int i = 1; i < path.size(); ++i) {
LatticeNode tail = path.get(i - 1);
LatticeNode head = path.get(i);
int tailCoordinates = tail.getCoordinates();
int headCoordinates = head.getCoordinates();
for (int j = 0; j < tailCoordinates.length; ++j) {
if (tailCoordinates[j] != headCoordinates[j]) {
stringBuilders[j].
append(sequenceArray[j].charAt(tailCoordinates[j]));
} else {
stringBuilders[j].append(AminoAcidAlphabet.GAP_CHARACTER);
}
}
}
for (int i = 0; i < strings.length; ++i) {
strings[i] = stringBuilders[i].toString();
}
return new Alignment(strings, cost);
}
private String checkIsValidGenomicSequence(String string) {
Set<Character> characterSet =
AminoAcidAlphabet.getAminoAcidAlphabet()
.getCharacterSet();
for (char c : string.toCharArray()) {
if (!characterSet.contains(c)) {
throw new IllegalArgumentException("Unknown amino acid: " + c);
}
}
return string;
}
private static final class LatticeNodeHolder
implements Comparable<LatticeNodeHolder> {
private final Integer fScore;
private final LatticeNode node;
LatticeNodeHolder(LatticeNode node, Integer fScore) {
this.fScore = fScore;
this.node = node;
}
LatticeNode getNode() {
return node;
}
@Override
public int compareTo(LatticeNodeHolder o) {
return Integer.compare(fScore, o.fScore);
}
}
}
HeuristicFunctionComputer.java
package net.coderodde.bio.msa;
import java.awt.Point;
import java.util.HashSet;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Set;
final class HeuristicFunctionComputer {
HeuristicFunction computeHeuristicFunction(
MultipleSequenceAlignmentInstance instance) {
String sequenceArray = instance.getSequenceArray();
HeuristicFunction heuristicFunction = new HeuristicFunction(instance);
for (int dimension1 = 0;
dimension1 < sequenceArray.length;
dimension1++) {
for (int dimension2 = dimension1 + 1;
dimension2 < sequenceArray.length;
dimension2++) {
loadPartialHeuristicFunction(heuristicFunction,
dimension1,
dimension2,
instance);
}
}
return heuristicFunction;
}
// Basically, this method runs Dijkstra backwards from the target node until
// the entire 2D-grid is explored.
private void loadPartialHeuristicFunction(
HeuristicFunction heuristicFunction,
int dimension1,
int dimension2,
MultipleSequenceAlignmentInstance instance) {
LatticeNode target = instance.getTargetNode();
Queue<LatticeNodeHolder> open = new PriorityQueue<>();
Set<LatticeNode> closed = new HashSet<>();
open.add(new LatticeNodeHolder(target, 0));
Point point = new Point();
while (!open.isEmpty()) {
LatticeNodeHolder currentNodeHolder = open.remove();
LatticeNode currentNode = currentNodeHolder.getLatticeNode();
if (closed.contains(currentNode)) {
continue;
}
closed.add(currentNode);
Integer currentNodeCost = currentNodeHolder.getCost();
extractPoint(point, currentNode, dimension1, dimension2);
heuristicFunction.put(dimension1,
dimension2,
new Point(point),
currentNodeCost);
for (LatticeNode parent : currentNode.getParents()) {
if (closed.contains(parent)) {
continue;
}
int tentativeCost =
currentNodeCost + instance.getWeight(parent,
currentNode,
dimension1,
dimension2);
extractPoint(point, parent, dimension1, dimension2);
if (!heuristicFunction.containsPartial(dimension1,
dimension2,
point)
|| heuristicFunction.getPartial(dimension1,
dimension2,
point)
> tentativeCost) {
open.add(new LatticeNodeHolder(parent, tentativeCost));
}
}
}
}
private static void extractPoint(Point point,
LatticeNode latticeNode,
int i,
int j) {
int coordinates = latticeNode.getCoordinates();
point.x = coordinates[i];
point.y = coordinates[j];
}
private static final class LatticeNodeHolder
implements Comparable<LatticeNodeHolder> {
private final LatticeNode node;
private final Integer cost;
LatticeNodeHolder(LatticeNode node, Integer cost) {
this.node = node;
this.cost = cost;
}
LatticeNode getLatticeNode() {
return node;
}
Integer getCost() {
return cost;
}
@Override
public int compareTo(LatticeNodeHolder o) {
return Integer.compare(cost, o.cost);
}
}
}
LatticeNode.java
package net.coderodde.bio.msa;
import com.sun.corba.se.spi.orb.OperationFactory;
import java.util.Arrays;
final class LatticeNode {
/**
* The problem instance this lattice node contributes to.
*/
private final MultipleSequenceAlignmentInstance instance;
/**
* The coordinates of this node in the entire lattice.
*/
private final int coordinates;
LatticeNode(MultipleSequenceAlignmentInstance instance, int coordinates) {
this.instance = instance;
this.coordinates = coordinates;
}
@Override
public boolean equals(Object o) {
return Arrays.equals(coordinates, ((LatticeNode) o).coordinates);
}
// Generated by NetBeans 8.1:
@Override
public int hashCode() {
int hash = 7;
hash = 41 * hash + Arrays.hashCode(this.coordinates);
return hash;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder("[");
String separator = "";
for (int coordinate : coordinates) {
sb.append(separator).append(coordinate);
separator = ", ";
}
return sb.append("]").toString();
}
LatticeNode getChildren() {
// Find out in how many dimension we can move forward:
int dimensionsNotReached = 0;
String sequenceArray = instance.getSequenceArray();
boolean inclusionMap = new boolean[coordinates.length];
for (int i = 0; i < sequenceArray.length; ++i) {
if (coordinates[i] < sequenceArray[i].length()) {
dimensionsNotReached++;
// We can make a step forward in the direction of ith dimension:
inclusionMap[i] = true;
}
}
// Create the array of children:
int numberOfChildren = pow2(dimensionsNotReached) - 1;
LatticeNode children = new LatticeNode[numberOfChildren];
loadChildren(children, inclusionMap);
// Convert offsets to actual child coordinates:
for (LatticeNode child : children) {
child.addOffsets(coordinates);
}
return children;
}
LatticeNode getParents() {
// Find out in how many dimensions we can move BACKward:
int dimensionsNotReached = 0;
String sequenceArray = instance.getSequenceArray();
boolean inclusionMap = new boolean[coordinates.length];
for (int i = 0; i < sequenceArray.length; ++i) {
if (coordinates[i] > 0) {
dimensionsNotReached++;
// We can make a step backwards in the direction of ith
// dimension:
inclusionMap[i] = true;
}
}
// Create the array of parents:
int numberOfParents = pow2(dimensionsNotReached) - 1;
LatticeNode parents = new LatticeNode[numberOfParents];
loadParents(parents, inclusionMap);
// Convert the offsets to actual parent coordinates:
for (LatticeNode parent : parents) {
parent.addOffsets(coordinates);
}
return parents;
}
int getCoordinates() {
return coordinates;
}
private void loadChildren(LatticeNode children, boolean inclusionMap) {
int coords = new int[this.coordinates.length];
for (int i = 0; i != children.length; ++i) {
increment(coords, inclusionMap);
children[i] = new LatticeNode(instance, coords.clone());
}
}
private void loadParents(LatticeNode parents, boolean inclusionMap) {
int coords = new int[this.coordinates.length];
for (int i = 0; i != parents.length; ++i) {
decrement(coords, inclusionMap);
parents[i] = new LatticeNode(instance, coords.clone());
}
}
private static void increment(int coordinates, boolean inclusionMap) {
for (int i = 0; i < coordinates.length; ++i) {
if (!inclusionMap[i]) {
continue;
}
if (coordinates[i] == 0) {
coordinates[i] = 1;
return;
}
coordinates[i] = 0;
}
}
private static void decrement(int coordinates, boolean inclusionMap) {
for (int i = 0; i < coordinates.length; ++i) {
if (!inclusionMap[i]) {
continue;
}
if (coordinates[i] == 0) {
coordinates[i] = -1;
return;
}
coordinates[i] = 0;
}
}
private void addOffsets(int offsets) {
for (int i = 0; i < coordinates.length; ++i) {
coordinates[i] += offsets[i];
}
}
private static int pow2(int exponent) {
int ret = 1;
for (int e = 0; e < exponent; ++e) {
ret *= 2;
}
return ret;
}
}
PAM250CostMatrix.java
package net.coderodde.bio.msa;
import java.util.HashMap;
import java.util.Map;
import net.coderodde.bio.msa.AminoAcidAlphabet;
import net.coderodde.bio.msa.CostMatrix;
public final class PAM250CostMatrix implements CostMatrix<Integer> {
private static PAM250CostMatrix instance;
private final Map<Character, Map<Character, Integer>> m = new HashMap<>();
public static PAM250CostMatrix getPAM250CostMatrix() {
if (instance == null) {
instance = new PAM250CostMatrix();
}
return instance;
}
private PAM250CostMatrix() {
AminoAcidAlphabet alphabet = AminoAcidAlphabet.getAminoAcidAlphabet();
alphabet.getCharacterSet().stream().forEach((character) -> {
m.put(character, new HashMap<>());
});
m.get('A').put('A', -2);
m.get('R').put('A', 2);
m.get('A').put('R', 2);
m.get('R').put('R', -6);
m.get('N').put('A', 0);
m.get('A').put('N', 0);
m.get('N').put('R', 0);
m.get('R').put('N', 0);
m.get('N').put('N', -2);
m.get('D').put('A', 0);
m.get('A').put('D', 0);
m.get('D').put('R', 1);
m.get('R').put('D', 1);
m.get('D').put('N', -2);
m.get('N').put('D', -2);
m.get('D').put('D', -4);
m.get('C').put('A', 2);
m.get('A').put('C', 2);
m.get('C').put('R', 4);
m.get('R').put('C', 4);
m.get('C').put('N', 4);
m.get('N').put('C', 4);
m.get('C').put('D', 5);
m.get('D').put('C', 5);
m.get('C').put('C', -4);
m.get('Q').put('A', 0);
m.get('A').put('Q', 0);
m.get('Q').put('R', -1);
m.get('R').put('Q', -1);
m.get('Q').put('N', -1);
m.get('N').put('Q', -1);
m.get('Q').put('D', -2);
m.get('D').put('Q', -2);
m.get('Q').put('C', 5);
m.get('C').put('Q', 5);
m.get('Q').put('Q', -4);
m.get('E').put('A', 0);
m.get('A').put('E', 0);
m.get('E').put('R', 1);
m.get('R').put('E', 1);
m.get('E').put('N', -1);
m.get('N').put('E', -1);
m.get('E').put('D', -3);
m.get('D').put('E', -3);
m.get('E').put('C', 5);
m.get('C').put('E', 5);
m.get('E').put('Q', -2);
m.get('Q').put('E', -2);
m.get('E').put('E', -4);
m.get('G').put('A', -1);
m.get('A').put('G', -1);
m.get('G').put('R', 3);
m.get('R').put('G', 3);
m.get('G').put('N', 0);
m.get('N').put('G', 0);
m.get('G').put('D', -1);
m.get('D').put('G', -1);
m.get('G').put('C', 3);
m.get('C').put('G', 3);
m.get('G').put('Q', 1);
m.get('Q').put('G', 1);
m.get('G').put('E', 0);
m.get('E').put('G', 0);
m.get('G').put('G', -5);
m.get('H').put('A', 1);
m.get('A').put('H', 1);
m.get('H').put('R', -2);
m.get('R').put('H', -2);
m.get('H').put('N', -2);
m.get('N').put('H', -2);
m.get('H').put('D', -1);
m.get('D').put('H', -1);
m.get('H').put('C', 3);
m.get('C').put('H', 3);
m.get('H').put('Q', -3);
m.get('Q').put('H', -3);
m.get('H').put('E', -1);
m.get('E').put('H', -1);
m.get('H').put('G', 2);
m.get('G').put('H', 2);
m.get('H').put('H', -6);
m.get('I').put('A', 1);
m.get('A').put('I', 1);
m.get('I').put('R', 2);
m.get('R').put('I', 2);
m.get('I').put('N', 2);
m.get('N').put('I', 2);
m.get('I').put('D', 2);
m.get('D').put('I', 2);
m.get('I').put('C', 2);
m.get('C').put('I', 2);
m.get('I').put('Q', 2);
m.get('Q').put('I', 2);
m.get('I').put('E', 2);
m.get('E').put('I', 2);
m.get('I').put('G', 3);
m.get('G').put('I', 3);
m.get('I').put('H', 2);
m.get('H').put('I', 2);
m.get('I').put('I', -5);
m.get('L').put('A', 2);
m.get('A').put('L', 2);
m.get('L').put('R', 3);
m.get('R').put('L', 3);
m.get('L').put('N', 3);
m.get('N').put('L', 3);
m.get('L').put('D', 4);
m.get('D').put('L', 4);
m.get('L').put('C', 6);
m.get('C').put('L', 6);
m.get('L').put('Q', 2);
m.get('Q').put('L', 2);
m.get('L').put('E', 3);
m.get('E').put('L', 3);
m.get('L').put('G', 4);
m.get('G').put('L', 4);
m.get('L').put('H', 2);
m.get('H').put('L', 2);
m.get('L').put('I', -2);
m.get('I').put('L', -2);
m.get('L').put('L', -6);
m.get('K').put('A', 1);
m.get('A').put('K', 1);
m.get('K').put('R', -3);
m.get('R').put('K', -3);
m.get('K').put('N', -1);
m.get('N').put('K', -1);
m.get('K').put('D', 0);
m.get('D').put('K', 0);
m.get('K').put('C', 5);
m.get('C').put('K', 5);
m.get('K').put('Q', -1);
m.get('Q').put('K', -1);
m.get('K').put('E', 0);
m.get('E').put('K', 0);
m.get('K').put('G', 2);
m.get('G').put('K', 2);
m.get('K').put('H', 0);
m.get('H').put('K', 0);
m.get('K').put('I', 2);
m.get('I').put('K', 2);
m.get('K').put('L', 3);
m.get('L').put('K', 3);
m.get('K').put('K', -5);
m.get('M').put('A', 1);
m.get('A').put('M', 1);
m.get('M').put('R', 0);
m.get('R').put('M', 0);
m.get('M').put('N', 2);
m.get('N').put('M', 2);
m.get('M').put('D', 3);
m.get('D').put('M', 3);
m.get('M').put('C', 5);
m.get('C').put('M', 5);
m.get('M').put('Q', 1);
m.get('Q').put('M', 1);
m.get('M').put('E', 2);
m.get('E').put('M', 2);
m.get('M').put('G', 3);
m.get('G').put('M', 3);
m.get('M').put('H', 2);
m.get('H').put('M', 2);
m.get('M').put('I', -2);
m.get('I').put('M', -2);
m.get('M').put('L', -4);
m.get('L').put('M', -4);
m.get('M').put('K', 0);
m.get('K').put('M', 0);
m.get('M').put('M', -6);
m.get('F').put('A', 4);
m.get('A').put('F', 4);
m.get('F').put('R', 4);
m.get('R').put('F', 4);
m.get('F').put('N', 4);
m.get('N').put('F', 4);
m.get('F').put('D', 6);
m.get('D').put('F', 6);
m.get('F').put('C', 4);
m.get('C').put('F', 4);
m.get('F').put('Q', 5);
m.get('Q').put('F', 5);
m.get('F').put('E', 5);
m.get('E').put('F', 5);
m.get('F').put('G', 5);
m.get('G').put('F', 5);
m.get('F').put('H', 2);
m.get('H').put('F', 2);
m.get('F').put('I', -1);
m.get('I').put('F', -1);
m.get('F').put('L', -2);
m.get('L').put('F', -2);
m.get('F').put('K', 5);
m.get('K').put('F', 5);
m.get('F').put('M', 0);
m.get('M').put('F', 0);
m.get('F').put('F', -9);
m.get('P').put('A', -1);
m.get('A').put('P', -1);
m.get('P').put('R', 0);
m.get('R').put('P', 0);
m.get('P').put('N', 1);
m.get('N').put('P', 1);
m.get('P').put('D', 1);
m.get('D').put('P', 1);
m.get('P').put('C', 3);
m.get('C').put('P', 3);
m.get('P').put('Q', 0);
m.get('Q').put('P', 0);
m.get('P').put('E', 1);
m.get('E').put('P', 1);
m.get('P').put('G', 1);
m.get('G').put('P', 1);
m.get('P').put('H', 0);
m.get('H').put('P', 0);
m.get('P').put('I', 2);
m.get('I').put('P', 2);
m.get('P').put('L', 3);
m.get('L').put('P', 3);
m.get('P').put('K', 1);
m.get('K').put('P', 1);
m.get('P').put('M', 2);
m.get('M').put('P', 2);
m.get('P').put('F', 5);
m.get('F').put('P', 5);
m.get('P').put('P', -6);
m.get('S').put('A', -1);
m.get('A').put('S', -1);
m.get('S').put('R', 0);
m.get('R').put('S', 0);
m.get('S').put('N', -1);
m.get('N').put('S', -1);
m.get('S').put('D', 0);
m.get('D').put('S', 0);
m.get('S').put('C', 0);
m.get('C').put('S', 0);
m.get('S').put('Q', 1);
m.get('Q').put('S', 1);
m.get('S').put('E', 0);
m.get('E').put('S', 0);
m.get('S').put('G', -1);
m.get('G').put('S', -1);
m.get('S').put('H', 1);
m.get('H').put('S', 1);
m.get('S').put('I', 1);
m.get('I').put('S', 1);
m.get('S').put('L', 3);
m.get('L').put('S', 3);
m.get('S').put('K', 0);
m.get('K').put('S', 0);
m.get('S').put('M', 2);
m.get('M').put('S', 2);
m.get('S').put('F', 3);
m.get('F').put('S', 3);
m.get('S').put('P', -1);
m.get('P').put('S', -1);
m.get('S').put('S', -3);
m.get('T').put('A', -1);
m.get('A').put('T', -1);
m.get('T').put('R', 1);
m.get('R').put('T', 1);
m.get('T').put('N', 0);
m.get('N').put('T', 0);
m.get('T').put('D', 0);
m.get('D').put('T', 0);
m.get('T').put('C', 2);
m.get('C').put('T', 2);
m.get('T').put('Q', 1);
m.get('Q').put('T', 1);
m.get('T').put('E', 0);
m.get('E').put('T', 0);
m.get('T').put('G', 0);
m.get('G').put('T', 0);
m.get('T').put('H', 1);
m.get('H').put('T', 1);
m.get('T').put('I', 0);
m.get('I').put('T', 0);
m.get('T').put('L', 2);
m.get('L').put('T', 2);
m.get('T').put('K', 0);
m.get('K').put('T', 0);
m.get('T').put('M', 1);
m.get('M').put('T', 1);
m.get('T').put('F', 2);
m.get('F').put('T', 2);
m.get('T').put('P', 0);
m.get('P').put('T', 0);
m.get('T').put('S', -1);
m.get('S').put('T', -1);
m.get('T').put('T', -3);
m.get('W').put('A', 6);
m.get('A').put('W', 6);
m.get('W').put('R', -2);
m.get('R').put('W', -2);
m.get('W').put('N', 4);
m.get('N').put('W', 4);
m.get('W').put('D', 7);
m.get('D').put('W', 7);
m.get('W').put('C', 8);
m.get('C').put('W', 8);
m.get('W').put('Q', 5);
m.get('Q').put('W', 5);
m.get('W').put('E', 7);
m.get('E').put('W', 7);
m.get('W').put('G', 7);
m.get('G').put('W', 7);
m.get('W').put('H', 3);
m.get('H').put('W', 3);
m.get('W').put('I', 5);
m.get('I').put('W', 5);
m.get('W').put('L', 2);
m.get('L').put('W', 2);
m.get('W').put('K', 3);
m.get('K').put('W', 3);
m.get('W').put('M', 4);
m.get('M').put('W', 4);
m.get('W').put('F', 0);
m.get('F').put('W', 0);
m.get('W').put('P', 6);
m.get('P').put('W', 6);
m.get('W').put('S', 2);
m.get('S').put('W', 2);
m.get('W').put('T', 5);
m.get('T').put('W', 5);
m.get('W').put('W', -17);
m.get('Y').put('A', 3);
m.get('A').put('Y', 3);
m.get('Y').put('R', 4);
m.get('R').put('Y', 4);
m.get('Y').put('N', 2);
m.get('N').put('Y', 2);
m.get('Y').put('D', 4);
m.get('D').put('Y', 4);
m.get('Y').put('C', 0);
m.get('C').put('Y', 0);
m.get('Y').put('Q', 4);
m.get('Q').put('Y', 4);
m.get('Y').put('E', 4);
m.get('E').put('Y', 4);
m.get('Y').put('G', 5);
m.get('G').put('Y', 5);
m.get('Y').put('H', 0);
m.get('H').put('Y', 0);
m.get('Y').put('I', 1);
m.get('I').put('Y', 1);
m.get('Y').put('L', 1);
m.get('L').put('Y', 1);
m.get('Y').put('K', 4);
m.get('K').put('Y', 4);
m.get('Y').put('M', 2);
m.get('M').put('Y', 2);
m.get('Y').put('F', -7);
m.get('F').put('Y', -7);
m.get('Y').put('P', 5);
m.get('P').put('Y', 5);
m.get('Y').put('S', 3);
m.get('S').put('Y', 3);
m.get('Y').put('T', 3);
m.get('T').put('Y', 3);
m.get('Y').put('W', 0);
m.get('W').put('Y', 0);
m.get('Y').put('Y', -10);
m.get('V').put('A', 0);
m.get('A').put('V', 0);
m.get('V').put('R', 2);
m.get('R').put('V', 2);
m.get('V').put('N', 2);
m.get('N').put('V', 2);
m.get('V').put('D', 2);
m.get('D').put('V', 2);
m.get('V').put('C', 2);
m.get('C').put('V', 2);
m.get('V').put('Q', 2);
m.get('Q').put('V', 2);
m.get('V').put('E', 2);
m.get('E').put('V', 2);
m.get('V').put('G', 1);
m.get('G').put('V', 1);
m.get('V').put('H', 2);
m.get('H').put('V', 2);
m.get('V').put('I', -4);
m.get('I').put('V', -4);
m.get('V').put('L', -2);
m.get('L').put('V', -2);
m.get('V').put('K', 2);
m.get('K').put('V', 2);
m.get('V').put('M', -2);
m.get('M').put('V', -2);
m.get('V').put('F', 1);
m.get('F').put('V', 1);
m.get('V').put('P', 1);
m.get('P').put('V', 1);
m.get('V').put('S', 1);
m.get('S').put('V', 1);
m.get('V').put('T', 0);
m.get('T').put('V', 0);
m.get('V').put('W', 6);
m.get('W').put('V', 6);
m.get('V').put('Y', 2);
m.get('Y').put('V', 2);
m.get('V').put('V', -4);
}
@Override
public Integer getCost(Character aminoAcidChar1, Character aminoAcidChar2) {
try {
return m.get(aminoAcidChar1).get(aminoAcidChar2);
} catch (NullPointerException ex) {
throw new IllegalArgumentException("Bad arguments: (" +
aminoAcidChar1 + ", " + aminoAcidChar2 + ")",
ex);
}
}
}
AminoAcidAlphabet.java
package net.coderodde.bio.msa;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
public final class AminoAcidAlphabet {
public static final Character GAP_CHARACTER = '-';
private static AminoAcidAlphabet instance;
private final Set<Character> alphabet = new HashSet<>();
public static AminoAcidAlphabet getAminoAcidAlphabet() {
if (instance == null) {
instance = new AminoAcidAlphabet();
}
return instance;
}
private AminoAcidAlphabet() {
String aminoAcids = "ACDEF" +
"GHIKL" +
"MNPQR" +
"STVWY";
for (char c : aminoAcids.toCharArray()) {
alphabet.add(c);
}
}
public Set<Character> getCharacterSet() {
return Collections.<Character>unmodifiableSet(alphabet);
}
}
Alignment.java
package net.coderodde.bio.msa;
public class Alignment {
private final String alignment;
private final int cost;
Alignment(String alignment, int cost) {
this.alignment = alignment;
this.cost = cost;
}
public String getAlignemnt() {
return alignment;
}
public int getCost() {
return cost;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
String separator = "";
for (String row : alignment) {
sb.append(separator).append(row);
separator = "n";
}
sb.append("nCost: ").append(cost);
return sb.toString();
}
}
CostMatrix.java
package net.coderodde.bio.msa;
public interface CostMatrix<C> {
public C getCost(Character aminoAcidChar1, Character aminoAcidChar2);
}
HeuristicFunction.java
package net.coderodde.bio.msa;
import java.awt.Point;
import java.util.HashMap;
import java.util.Map;
final class HeuristicFunction {
private final Map<Point, Integer> matrix;
HeuristicFunction(MultipleSequenceAlignmentInstance instance) {
int sequences = instance.getSequenceArray().length;
this.matrix = new Map[sequences];
for (int i = 0; i < sequences; ++i) {
this.matrix[i] = new Map[sequences];
for (int j = i + 1; j < sequences; ++j) {
this.matrix[i][j] = new HashMap<>();
}
}
}
void put(int dimension1, int dimension2, Point point, Integer cost) {
matrix[dimension1][dimension2].put(point, cost);
}
boolean containsPartial(int dimension1, int dimension2, Point point) {
return matrix[dimension1][dimension2].containsKey(point);
}
Integer getPartial(int dimension1, int dimension2, Point point) {
return matrix[dimension1][dimension2].get(point);
}
Integer get(LatticeNode node) {
int cost = 0;
int coordinates = node.getCoordinates();
Point point = new Point();
for (int dimension1 = 0;
dimension1 < coordinates.length;
dimension1++) {
point.x = dimension1;
for (int dimension2 = dimension1 + 1;
dimension2 < coordinates.length;
dimension2++) {
point.y = dimension2;
cost += matrix[dimension1][dimension2].get(point);
}
}
return cost;
}
}
App.java
package net.coderodde.bio.msa;
final class App {
private static final String SEQUENCES = {
"ACGHKGMNPFQEKKFKLMNRW",
"CFGPQWYRTLMEKKFKNR",
"EACLMNRPQWTR",
"TIMWAYHTMGIEKKFK"
};
public static void main(String args) {
MultipleSequenceAlignmentInstance instance =
new MultipleSequenceAlignmentInstance(
PAM250CostMatrix.getPAM250CostMatrix(),
4,
SEQUENCES);
long start = System.currentTimeMillis();
Alignment alignment1 = instance.align();
long end = System.currentTimeMillis();
System.out.println(alignment1);
System.out.println(end - start + " ms.");
System.out.println();
start = System.currentTimeMillis();
Alignment alignment2 = instance.alignBrute();
end = System.currentTimeMillis();
System.out.println(alignment2);
System.out.println(end - start + " ms.");
}
}
Output
The output for the demo is
Computed heuristic function in 1003 milliseconds.
-ACG-HKGMNPF-QEKKFKLMNRW
-CFGPQW-YRTL-MEKKFK--NR-
EAC--LM-NRP--Q-WT-R-----
-TI--MWAYHTMGIEKKFK-----
Cost: 53
1126 ms.
-ACG-HKGMNPF-QEKKFKLMNRW
-CFGPQW-YRTL-MEKKFK--NR-
EAC--LM-NRP--Q-WT-R-----
-TI--MWAYHTMGIEKKFK-----
Cost: 53
60 ms.
Critique request
As always, please tell me anything that comes to mind.
java algorithm bioinformatics
(The entire project lives here.)
Problem definition
Suppose we are given three genomic strings drawn from the alphabet of 20 amino acids:
ACGH
CFG
EAC
In multiple sequence alignment problem we want to align them optimally in order to access their evolutionary similarity. Such an alignment may look like the following:
-AC-GH
--CFG-
EAC---
Solution
Often, the problem is cast as a pathfinding problem in $k$-dimensional lattice, where $k$ is the number of strings. Here is my program, it compares A* with Dijkstra's algorithms:
MultipleSequenceAlignmentInstance.java
package net.coderodde.bio.msa;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Set;
public final class MultipleSequenceAlignmentInstance {
/**
* The penalty for pairs in which there is one valid character and one gap.
*/
private final int gapPenalty;
/**
* The character pairs cost matrix.
*/
private final CostMatrix<Integer> costMatrix;
/**
* The array of sequences to be aligned.
*/
private final String sequenceArray;
// A small speed optimization:
private final char column;
public MultipleSequenceAlignmentInstance(CostMatrix<Integer> costMatrix,
int gapPenalty,
String... sequenceArray) {
this.costMatrix = Objects.requireNonNull(costMatrix,
"Cost matrix is null");
this.gapPenalty = gapPenalty;
this.sequenceArray = sequenceArray.clone();
this.column = new char[sequenceArray.length];
for (int i = 0; i != sequenceArray.length; ++i) {
this.sequenceArray[i] =
checkIsValidGenomicSequence(sequenceArray[i]);
}
}
public Alignment align() {
long start = System.currentTimeMillis();
HeuristicFunction hf = new HeuristicFunctionComputer()
.computeHeuristicFunction(this);
long end = System.currentTimeMillis();
System.out.println("Computed heuristic function in " + (end - start) +
" milliseconds.");
Queue<LatticeNodeHolder> open = new PriorityQueue<>();
Map<LatticeNode, Integer> distance = new HashMap<>();
Map<LatticeNode, LatticeNode> parents = new HashMap<>();
LatticeNode sourceNode = getSourceNode();
LatticeNode targetNode = getTargetNode();
open.add(new LatticeNodeHolder(sourceNode, 0));
distance.put(sourceNode, 0);
parents.put(sourceNode, null);
Set<LatticeNode> closed = new HashSet<>();
while (true) {
LatticeNode currentNode = open.remove().getNode();
if (currentNode.equals(targetNode)) {
return tracebackPath(parents, distance.get(currentNode));
}
if (closed.contains(currentNode)) {
continue;
}
closed.add(currentNode);
for (LatticeNode childNode : currentNode.getChildren()) {
if (closed.contains(childNode)) {
continue;
}
int tentativeCost = distance.get(currentNode) +
getWeight(currentNode, childNode);
Integer currentCost = distance.get(childNode);
if (currentCost == null || currentCost > tentativeCost) {
open.add(new LatticeNodeHolder(childNode,
tentativeCost +
hf.get(childNode)));
distance.put(childNode, tentativeCost);
parents.put(childNode, currentNode);
}
}
}
}
public Alignment alignBrute() {
Queue<LatticeNodeHolder> open = new PriorityQueue<>();
Map<LatticeNode, Integer> distance = new HashMap<>();
Map<LatticeNode, LatticeNode> parents = new HashMap<>();
LatticeNode sourceNode = getSourceNode();
LatticeNode targetNode = getTargetNode();
open.add(new LatticeNodeHolder(sourceNode, 0));
distance.put(sourceNode, 0);
parents.put(sourceNode, null);
Set<LatticeNode> closed = new HashSet<>();
while (true) {
LatticeNode currentNode = open.remove().getNode();
if (currentNode.equals(targetNode)) {
return tracebackPath(parents, distance.get(currentNode));
}
if (closed.contains(currentNode)) {
continue;
}
closed.add(currentNode);
for (LatticeNode childNode : currentNode.getChildren()) {
if (closed.contains(childNode)) {
continue;
}
int tentativeCost = distance.get(currentNode) +
getWeight(currentNode, childNode);
Integer currentCost = distance.get(childNode);
if (currentCost == null || currentCost > tentativeCost) {
open.add(new LatticeNodeHolder(childNode, tentativeCost));
distance.put(childNode, tentativeCost);
parents.put(childNode, currentNode);
}
}
}
}
/**
* Create the source node for the sequence alignment task, i.e., a lattice
* node with all coordinates set to zero.
*
* @return the source node.
*/
LatticeNode getSourceNode() {
int sourceCoordinates = new int[sequenceArray.length];
return new LatticeNode(this, sourceCoordinates);
}
/**
* Create the target node for the sequence alignment task, i.e., a lattice
* node with all coordinates set to the respective sequence lengths.
*
* @return the target node.
*/
LatticeNode getTargetNode() {
int targetCoordinates = new int[sequenceArray.length];
for (int i = 0; i != sequenceArray.length; ++i) {
targetCoordinates[i] = sequenceArray[i].length();
}
return new LatticeNode(this, targetCoordinates);
}
Integer getWeight(LatticeNode tail,
LatticeNode head,
int dimension1,
int dimension2) {
int tailCoordinates = tail.getCoordinates();
int headCoordinates = head.getCoordinates();
if (tailCoordinates[dimension1] == headCoordinates[dimension1]) {
//System.out.println("1");
return gapPenalty;
} else if (tailCoordinates[dimension2] == headCoordinates[dimension2]) {
//System.out.println("2");
return gapPenalty;
} else {
// System.out.println("3");
char character1 = sequenceArray[dimension1]
.charAt(tailCoordinates[dimension1]);
char character2 = sequenceArray[dimension2]
.charAt(tailCoordinates[dimension2]);
return costMatrix.getCost(character1, character2);
}
}
Integer getWeight(LatticeNode tail, LatticeNode head) {
// Extract the column represented by taking a single hop from 'tail' to
// 'head':
int tailCoordinates = tail.getCoordinates();
int headCoordinates = head.getCoordinates();
for (int i = 0; i < sequenceArray.length; ++i) {
if (tailCoordinates[i] + 1 == headCoordinates[i]) {
column[i] = sequenceArray[i].charAt(tailCoordinates[i]);
} else {
column[i] = AminoAcidAlphabet.GAP_CHARACTER;
}
}
// Compute the hop cost as the sum of pairwise hops in any plane:
int cost = 0;
for (int i = 0; i < column.length; ++i) {
char character1 = column[i];
for (int j = i + 1; j < column.length; ++j) {
char character2 = column[j];
if (character1 != AminoAcidAlphabet.GAP_CHARACTER) {
if (character2 != AminoAcidAlphabet.GAP_CHARACTER) {
cost += costMatrix.getCost(character1, character2);
} else {
cost += gapPenalty;
}
} else {
// character1 IS the gap character:
if (character2 != AminoAcidAlphabet.GAP_CHARACTER) {
cost += gapPenalty;
} else {
// Do nothing since we have a pair (gap, gap).
}
}
}
}
return cost;
}
String getSequenceArray() {
return sequenceArray;
}
private Alignment tracebackPath(Map<LatticeNode, LatticeNode> parents,
Integer cost) {
List<LatticeNode> path = new ArrayList<>();
LatticeNode node = getTargetNode();
while (node != null) {
path.add(node);
node = parents.get(node);
}
Collections.<LatticeNode>reverse(path);
String strings = new String[getSequenceArray().length];
StringBuilder stringBuilders = new StringBuilder[strings.length];
for (int i = 0; i < stringBuilders.length; ++i) {
stringBuilders[i] = new StringBuilder();
}
for (int i = 1; i < path.size(); ++i) {
LatticeNode tail = path.get(i - 1);
LatticeNode head = path.get(i);
int tailCoordinates = tail.getCoordinates();
int headCoordinates = head.getCoordinates();
for (int j = 0; j < tailCoordinates.length; ++j) {
if (tailCoordinates[j] != headCoordinates[j]) {
stringBuilders[j].
append(sequenceArray[j].charAt(tailCoordinates[j]));
} else {
stringBuilders[j].append(AminoAcidAlphabet.GAP_CHARACTER);
}
}
}
for (int i = 0; i < strings.length; ++i) {
strings[i] = stringBuilders[i].toString();
}
return new Alignment(strings, cost);
}
private String checkIsValidGenomicSequence(String string) {
Set<Character> characterSet =
AminoAcidAlphabet.getAminoAcidAlphabet()
.getCharacterSet();
for (char c : string.toCharArray()) {
if (!characterSet.contains(c)) {
throw new IllegalArgumentException("Unknown amino acid: " + c);
}
}
return string;
}
private static final class LatticeNodeHolder
implements Comparable<LatticeNodeHolder> {
private final Integer fScore;
private final LatticeNode node;
LatticeNodeHolder(LatticeNode node, Integer fScore) {
this.fScore = fScore;
this.node = node;
}
LatticeNode getNode() {
return node;
}
@Override
public int compareTo(LatticeNodeHolder o) {
return Integer.compare(fScore, o.fScore);
}
}
}
HeuristicFunctionComputer.java
package net.coderodde.bio.msa;
import java.awt.Point;
import java.util.HashSet;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Set;
final class HeuristicFunctionComputer {
HeuristicFunction computeHeuristicFunction(
MultipleSequenceAlignmentInstance instance) {
String sequenceArray = instance.getSequenceArray();
HeuristicFunction heuristicFunction = new HeuristicFunction(instance);
for (int dimension1 = 0;
dimension1 < sequenceArray.length;
dimension1++) {
for (int dimension2 = dimension1 + 1;
dimension2 < sequenceArray.length;
dimension2++) {
loadPartialHeuristicFunction(heuristicFunction,
dimension1,
dimension2,
instance);
}
}
return heuristicFunction;
}
// Basically, this method runs Dijkstra backwards from the target node until
// the entire 2D-grid is explored.
private void loadPartialHeuristicFunction(
HeuristicFunction heuristicFunction,
int dimension1,
int dimension2,
MultipleSequenceAlignmentInstance instance) {
LatticeNode target = instance.getTargetNode();
Queue<LatticeNodeHolder> open = new PriorityQueue<>();
Set<LatticeNode> closed = new HashSet<>();
open.add(new LatticeNodeHolder(target, 0));
Point point = new Point();
while (!open.isEmpty()) {
LatticeNodeHolder currentNodeHolder = open.remove();
LatticeNode currentNode = currentNodeHolder.getLatticeNode();
if (closed.contains(currentNode)) {
continue;
}
closed.add(currentNode);
Integer currentNodeCost = currentNodeHolder.getCost();
extractPoint(point, currentNode, dimension1, dimension2);
heuristicFunction.put(dimension1,
dimension2,
new Point(point),
currentNodeCost);
for (LatticeNode parent : currentNode.getParents()) {
if (closed.contains(parent)) {
continue;
}
int tentativeCost =
currentNodeCost + instance.getWeight(parent,
currentNode,
dimension1,
dimension2);
extractPoint(point, parent, dimension1, dimension2);
if (!heuristicFunction.containsPartial(dimension1,
dimension2,
point)
|| heuristicFunction.getPartial(dimension1,
dimension2,
point)
> tentativeCost) {
open.add(new LatticeNodeHolder(parent, tentativeCost));
}
}
}
}
private static void extractPoint(Point point,
LatticeNode latticeNode,
int i,
int j) {
int coordinates = latticeNode.getCoordinates();
point.x = coordinates[i];
point.y = coordinates[j];
}
private static final class LatticeNodeHolder
implements Comparable<LatticeNodeHolder> {
private final LatticeNode node;
private final Integer cost;
LatticeNodeHolder(LatticeNode node, Integer cost) {
this.node = node;
this.cost = cost;
}
LatticeNode getLatticeNode() {
return node;
}
Integer getCost() {
return cost;
}
@Override
public int compareTo(LatticeNodeHolder o) {
return Integer.compare(cost, o.cost);
}
}
}
LatticeNode.java
package net.coderodde.bio.msa;
import com.sun.corba.se.spi.orb.OperationFactory;
import java.util.Arrays;
final class LatticeNode {
/**
* The problem instance this lattice node contributes to.
*/
private final MultipleSequenceAlignmentInstance instance;
/**
* The coordinates of this node in the entire lattice.
*/
private final int coordinates;
LatticeNode(MultipleSequenceAlignmentInstance instance, int coordinates) {
this.instance = instance;
this.coordinates = coordinates;
}
@Override
public boolean equals(Object o) {
return Arrays.equals(coordinates, ((LatticeNode) o).coordinates);
}
// Generated by NetBeans 8.1:
@Override
public int hashCode() {
int hash = 7;
hash = 41 * hash + Arrays.hashCode(this.coordinates);
return hash;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder("[");
String separator = "";
for (int coordinate : coordinates) {
sb.append(separator).append(coordinate);
separator = ", ";
}
return sb.append("]").toString();
}
LatticeNode getChildren() {
// Find out in how many dimension we can move forward:
int dimensionsNotReached = 0;
String sequenceArray = instance.getSequenceArray();
boolean inclusionMap = new boolean[coordinates.length];
for (int i = 0; i < sequenceArray.length; ++i) {
if (coordinates[i] < sequenceArray[i].length()) {
dimensionsNotReached++;
// We can make a step forward in the direction of ith dimension:
inclusionMap[i] = true;
}
}
// Create the array of children:
int numberOfChildren = pow2(dimensionsNotReached) - 1;
LatticeNode children = new LatticeNode[numberOfChildren];
loadChildren(children, inclusionMap);
// Convert offsets to actual child coordinates:
for (LatticeNode child : children) {
child.addOffsets(coordinates);
}
return children;
}
LatticeNode getParents() {
// Find out in how many dimensions we can move BACKward:
int dimensionsNotReached = 0;
String sequenceArray = instance.getSequenceArray();
boolean inclusionMap = new boolean[coordinates.length];
for (int i = 0; i < sequenceArray.length; ++i) {
if (coordinates[i] > 0) {
dimensionsNotReached++;
// We can make a step backwards in the direction of ith
// dimension:
inclusionMap[i] = true;
}
}
// Create the array of parents:
int numberOfParents = pow2(dimensionsNotReached) - 1;
LatticeNode parents = new LatticeNode[numberOfParents];
loadParents(parents, inclusionMap);
// Convert the offsets to actual parent coordinates:
for (LatticeNode parent : parents) {
parent.addOffsets(coordinates);
}
return parents;
}
int getCoordinates() {
return coordinates;
}
private void loadChildren(LatticeNode children, boolean inclusionMap) {
int coords = new int[this.coordinates.length];
for (int i = 0; i != children.length; ++i) {
increment(coords, inclusionMap);
children[i] = new LatticeNode(instance, coords.clone());
}
}
private void loadParents(LatticeNode parents, boolean inclusionMap) {
int coords = new int[this.coordinates.length];
for (int i = 0; i != parents.length; ++i) {
decrement(coords, inclusionMap);
parents[i] = new LatticeNode(instance, coords.clone());
}
}
private static void increment(int coordinates, boolean inclusionMap) {
for (int i = 0; i < coordinates.length; ++i) {
if (!inclusionMap[i]) {
continue;
}
if (coordinates[i] == 0) {
coordinates[i] = 1;
return;
}
coordinates[i] = 0;
}
}
private static void decrement(int coordinates, boolean inclusionMap) {
for (int i = 0; i < coordinates.length; ++i) {
if (!inclusionMap[i]) {
continue;
}
if (coordinates[i] == 0) {
coordinates[i] = -1;
return;
}
coordinates[i] = 0;
}
}
private void addOffsets(int offsets) {
for (int i = 0; i < coordinates.length; ++i) {
coordinates[i] += offsets[i];
}
}
private static int pow2(int exponent) {
int ret = 1;
for (int e = 0; e < exponent; ++e) {
ret *= 2;
}
return ret;
}
}
PAM250CostMatrix.java
package net.coderodde.bio.msa;
import java.util.HashMap;
import java.util.Map;
import net.coderodde.bio.msa.AminoAcidAlphabet;
import net.coderodde.bio.msa.CostMatrix;
public final class PAM250CostMatrix implements CostMatrix<Integer> {
private static PAM250CostMatrix instance;
private final Map<Character, Map<Character, Integer>> m = new HashMap<>();
public static PAM250CostMatrix getPAM250CostMatrix() {
if (instance == null) {
instance = new PAM250CostMatrix();
}
return instance;
}
private PAM250CostMatrix() {
AminoAcidAlphabet alphabet = AminoAcidAlphabet.getAminoAcidAlphabet();
alphabet.getCharacterSet().stream().forEach((character) -> {
m.put(character, new HashMap<>());
});
m.get('A').put('A', -2);
m.get('R').put('A', 2);
m.get('A').put('R', 2);
m.get('R').put('R', -6);
m.get('N').put('A', 0);
m.get('A').put('N', 0);
m.get('N').put('R', 0);
m.get('R').put('N', 0);
m.get('N').put('N', -2);
m.get('D').put('A', 0);
m.get('A').put('D', 0);
m.get('D').put('R', 1);
m.get('R').put('D', 1);
m.get('D').put('N', -2);
m.get('N').put('D', -2);
m.get('D').put('D', -4);
m.get('C').put('A', 2);
m.get('A').put('C', 2);
m.get('C').put('R', 4);
m.get('R').put('C', 4);
m.get('C').put('N', 4);
m.get('N').put('C', 4);
m.get('C').put('D', 5);
m.get('D').put('C', 5);
m.get('C').put('C', -4);
m.get('Q').put('A', 0);
m.get('A').put('Q', 0);
m.get('Q').put('R', -1);
m.get('R').put('Q', -1);
m.get('Q').put('N', -1);
m.get('N').put('Q', -1);
m.get('Q').put('D', -2);
m.get('D').put('Q', -2);
m.get('Q').put('C', 5);
m.get('C').put('Q', 5);
m.get('Q').put('Q', -4);
m.get('E').put('A', 0);
m.get('A').put('E', 0);
m.get('E').put('R', 1);
m.get('R').put('E', 1);
m.get('E').put('N', -1);
m.get('N').put('E', -1);
m.get('E').put('D', -3);
m.get('D').put('E', -3);
m.get('E').put('C', 5);
m.get('C').put('E', 5);
m.get('E').put('Q', -2);
m.get('Q').put('E', -2);
m.get('E').put('E', -4);
m.get('G').put('A', -1);
m.get('A').put('G', -1);
m.get('G').put('R', 3);
m.get('R').put('G', 3);
m.get('G').put('N', 0);
m.get('N').put('G', 0);
m.get('G').put('D', -1);
m.get('D').put('G', -1);
m.get('G').put('C', 3);
m.get('C').put('G', 3);
m.get('G').put('Q', 1);
m.get('Q').put('G', 1);
m.get('G').put('E', 0);
m.get('E').put('G', 0);
m.get('G').put('G', -5);
m.get('H').put('A', 1);
m.get('A').put('H', 1);
m.get('H').put('R', -2);
m.get('R').put('H', -2);
m.get('H').put('N', -2);
m.get('N').put('H', -2);
m.get('H').put('D', -1);
m.get('D').put('H', -1);
m.get('H').put('C', 3);
m.get('C').put('H', 3);
m.get('H').put('Q', -3);
m.get('Q').put('H', -3);
m.get('H').put('E', -1);
m.get('E').put('H', -1);
m.get('H').put('G', 2);
m.get('G').put('H', 2);
m.get('H').put('H', -6);
m.get('I').put('A', 1);
m.get('A').put('I', 1);
m.get('I').put('R', 2);
m.get('R').put('I', 2);
m.get('I').put('N', 2);
m.get('N').put('I', 2);
m.get('I').put('D', 2);
m.get('D').put('I', 2);
m.get('I').put('C', 2);
m.get('C').put('I', 2);
m.get('I').put('Q', 2);
m.get('Q').put('I', 2);
m.get('I').put('E', 2);
m.get('E').put('I', 2);
m.get('I').put('G', 3);
m.get('G').put('I', 3);
m.get('I').put('H', 2);
m.get('H').put('I', 2);
m.get('I').put('I', -5);
m.get('L').put('A', 2);
m.get('A').put('L', 2);
m.get('L').put('R', 3);
m.get('R').put('L', 3);
m.get('L').put('N', 3);
m.get('N').put('L', 3);
m.get('L').put('D', 4);
m.get('D').put('L', 4);
m.get('L').put('C', 6);
m.get('C').put('L', 6);
m.get('L').put('Q', 2);
m.get('Q').put('L', 2);
m.get('L').put('E', 3);
m.get('E').put('L', 3);
m.get('L').put('G', 4);
m.get('G').put('L', 4);
m.get('L').put('H', 2);
m.get('H').put('L', 2);
m.get('L').put('I', -2);
m.get('I').put('L', -2);
m.get('L').put('L', -6);
m.get('K').put('A', 1);
m.get('A').put('K', 1);
m.get('K').put('R', -3);
m.get('R').put('K', -3);
m.get('K').put('N', -1);
m.get('N').put('K', -1);
m.get('K').put('D', 0);
m.get('D').put('K', 0);
m.get('K').put('C', 5);
m.get('C').put('K', 5);
m.get('K').put('Q', -1);
m.get('Q').put('K', -1);
m.get('K').put('E', 0);
m.get('E').put('K', 0);
m.get('K').put('G', 2);
m.get('G').put('K', 2);
m.get('K').put('H', 0);
m.get('H').put('K', 0);
m.get('K').put('I', 2);
m.get('I').put('K', 2);
m.get('K').put('L', 3);
m.get('L').put('K', 3);
m.get('K').put('K', -5);
m.get('M').put('A', 1);
m.get('A').put('M', 1);
m.get('M').put('R', 0);
m.get('R').put('M', 0);
m.get('M').put('N', 2);
m.get('N').put('M', 2);
m.get('M').put('D', 3);
m.get('D').put('M', 3);
m.get('M').put('C', 5);
m.get('C').put('M', 5);
m.get('M').put('Q', 1);
m.get('Q').put('M', 1);
m.get('M').put('E', 2);
m.get('E').put('M', 2);
m.get('M').put('G', 3);
m.get('G').put('M', 3);
m.get('M').put('H', 2);
m.get('H').put('M', 2);
m.get('M').put('I', -2);
m.get('I').put('M', -2);
m.get('M').put('L', -4);
m.get('L').put('M', -4);
m.get('M').put('K', 0);
m.get('K').put('M', 0);
m.get('M').put('M', -6);
m.get('F').put('A', 4);
m.get('A').put('F', 4);
m.get('F').put('R', 4);
m.get('R').put('F', 4);
m.get('F').put('N', 4);
m.get('N').put('F', 4);
m.get('F').put('D', 6);
m.get('D').put('F', 6);
m.get('F').put('C', 4);
m.get('C').put('F', 4);
m.get('F').put('Q', 5);
m.get('Q').put('F', 5);
m.get('F').put('E', 5);
m.get('E').put('F', 5);
m.get('F').put('G', 5);
m.get('G').put('F', 5);
m.get('F').put('H', 2);
m.get('H').put('F', 2);
m.get('F').put('I', -1);
m.get('I').put('F', -1);
m.get('F').put('L', -2);
m.get('L').put('F', -2);
m.get('F').put('K', 5);
m.get('K').put('F', 5);
m.get('F').put('M', 0);
m.get('M').put('F', 0);
m.get('F').put('F', -9);
m.get('P').put('A', -1);
m.get('A').put('P', -1);
m.get('P').put('R', 0);
m.get('R').put('P', 0);
m.get('P').put('N', 1);
m.get('N').put('P', 1);
m.get('P').put('D', 1);
m.get('D').put('P', 1);
m.get('P').put('C', 3);
m.get('C').put('P', 3);
m.get('P').put('Q', 0);
m.get('Q').put('P', 0);
m.get('P').put('E', 1);
m.get('E').put('P', 1);
m.get('P').put('G', 1);
m.get('G').put('P', 1);
m.get('P').put('H', 0);
m.get('H').put('P', 0);
m.get('P').put('I', 2);
m.get('I').put('P', 2);
m.get('P').put('L', 3);
m.get('L').put('P', 3);
m.get('P').put('K', 1);
m.get('K').put('P', 1);
m.get('P').put('M', 2);
m.get('M').put('P', 2);
m.get('P').put('F', 5);
m.get('F').put('P', 5);
m.get('P').put('P', -6);
m.get('S').put('A', -1);
m.get('A').put('S', -1);
m.get('S').put('R', 0);
m.get('R').put('S', 0);
m.get('S').put('N', -1);
m.get('N').put('S', -1);
m.get('S').put('D', 0);
m.get('D').put('S', 0);
m.get('S').put('C', 0);
m.get('C').put('S', 0);
m.get('S').put('Q', 1);
m.get('Q').put('S', 1);
m.get('S').put('E', 0);
m.get('E').put('S', 0);
m.get('S').put('G', -1);
m.get('G').put('S', -1);
m.get('S').put('H', 1);
m.get('H').put('S', 1);
m.get('S').put('I', 1);
m.get('I').put('S', 1);
m.get('S').put('L', 3);
m.get('L').put('S', 3);
m.get('S').put('K', 0);
m.get('K').put('S', 0);
m.get('S').put('M', 2);
m.get('M').put('S', 2);
m.get('S').put('F', 3);
m.get('F').put('S', 3);
m.get('S').put('P', -1);
m.get('P').put('S', -1);
m.get('S').put('S', -3);
m.get('T').put('A', -1);
m.get('A').put('T', -1);
m.get('T').put('R', 1);
m.get('R').put('T', 1);
m.get('T').put('N', 0);
m.get('N').put('T', 0);
m.get('T').put('D', 0);
m.get('D').put('T', 0);
m.get('T').put('C', 2);
m.get('C').put('T', 2);
m.get('T').put('Q', 1);
m.get('Q').put('T', 1);
m.get('T').put('E', 0);
m.get('E').put('T', 0);
m.get('T').put('G', 0);
m.get('G').put('T', 0);
m.get('T').put('H', 1);
m.get('H').put('T', 1);
m.get('T').put('I', 0);
m.get('I').put('T', 0);
m.get('T').put('L', 2);
m.get('L').put('T', 2);
m.get('T').put('K', 0);
m.get('K').put('T', 0);
m.get('T').put('M', 1);
m.get('M').put('T', 1);
m.get('T').put('F', 2);
m.get('F').put('T', 2);
m.get('T').put('P', 0);
m.get('P').put('T', 0);
m.get('T').put('S', -1);
m.get('S').put('T', -1);
m.get('T').put('T', -3);
m.get('W').put('A', 6);
m.get('A').put('W', 6);
m.get('W').put('R', -2);
m.get('R').put('W', -2);
m.get('W').put('N', 4);
m.get('N').put('W', 4);
m.get('W').put('D', 7);
m.get('D').put('W', 7);
m.get('W').put('C', 8);
m.get('C').put('W', 8);
m.get('W').put('Q', 5);
m.get('Q').put('W', 5);
m.get('W').put('E', 7);
m.get('E').put('W', 7);
m.get('W').put('G', 7);
m.get('G').put('W', 7);
m.get('W').put('H', 3);
m.get('H').put('W', 3);
m.get('W').put('I', 5);
m.get('I').put('W', 5);
m.get('W').put('L', 2);
m.get('L').put('W', 2);
m.get('W').put('K', 3);
m.get('K').put('W', 3);
m.get('W').put('M', 4);
m.get('M').put('W', 4);
m.get('W').put('F', 0);
m.get('F').put('W', 0);
m.get('W').put('P', 6);
m.get('P').put('W', 6);
m.get('W').put('S', 2);
m.get('S').put('W', 2);
m.get('W').put('T', 5);
m.get('T').put('W', 5);
m.get('W').put('W', -17);
m.get('Y').put('A', 3);
m.get('A').put('Y', 3);
m.get('Y').put('R', 4);
m.get('R').put('Y', 4);
m.get('Y').put('N', 2);
m.get('N').put('Y', 2);
m.get('Y').put('D', 4);
m.get('D').put('Y', 4);
m.get('Y').put('C', 0);
m.get('C').put('Y', 0);
m.get('Y').put('Q', 4);
m.get('Q').put('Y', 4);
m.get('Y').put('E', 4);
m.get('E').put('Y', 4);
m.get('Y').put('G', 5);
m.get('G').put('Y', 5);
m.get('Y').put('H', 0);
m.get('H').put('Y', 0);
m.get('Y').put('I', 1);
m.get('I').put('Y', 1);
m.get('Y').put('L', 1);
m.get('L').put('Y', 1);
m.get('Y').put('K', 4);
m.get('K').put('Y', 4);
m.get('Y').put('M', 2);
m.get('M').put('Y', 2);
m.get('Y').put('F', -7);
m.get('F').put('Y', -7);
m.get('Y').put('P', 5);
m.get('P').put('Y', 5);
m.get('Y').put('S', 3);
m.get('S').put('Y', 3);
m.get('Y').put('T', 3);
m.get('T').put('Y', 3);
m.get('Y').put('W', 0);
m.get('W').put('Y', 0);
m.get('Y').put('Y', -10);
m.get('V').put('A', 0);
m.get('A').put('V', 0);
m.get('V').put('R', 2);
m.get('R').put('V', 2);
m.get('V').put('N', 2);
m.get('N').put('V', 2);
m.get('V').put('D', 2);
m.get('D').put('V', 2);
m.get('V').put('C', 2);
m.get('C').put('V', 2);
m.get('V').put('Q', 2);
m.get('Q').put('V', 2);
m.get('V').put('E', 2);
m.get('E').put('V', 2);
m.get('V').put('G', 1);
m.get('G').put('V', 1);
m.get('V').put('H', 2);
m.get('H').put('V', 2);
m.get('V').put('I', -4);
m.get('I').put('V', -4);
m.get('V').put('L', -2);
m.get('L').put('V', -2);
m.get('V').put('K', 2);
m.get('K').put('V', 2);
m.get('V').put('M', -2);
m.get('M').put('V', -2);
m.get('V').put('F', 1);
m.get('F').put('V', 1);
m.get('V').put('P', 1);
m.get('P').put('V', 1);
m.get('V').put('S', 1);
m.get('S').put('V', 1);
m.get('V').put('T', 0);
m.get('T').put('V', 0);
m.get('V').put('W', 6);
m.get('W').put('V', 6);
m.get('V').put('Y', 2);
m.get('Y').put('V', 2);
m.get('V').put('V', -4);
}
@Override
public Integer getCost(Character aminoAcidChar1, Character aminoAcidChar2) {
try {
return m.get(aminoAcidChar1).get(aminoAcidChar2);
} catch (NullPointerException ex) {
throw new IllegalArgumentException("Bad arguments: (" +
aminoAcidChar1 + ", " + aminoAcidChar2 + ")",
ex);
}
}
}
AminoAcidAlphabet.java
package net.coderodde.bio.msa;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
public final class AminoAcidAlphabet {
public static final Character GAP_CHARACTER = '-';
private static AminoAcidAlphabet instance;
private final Set<Character> alphabet = new HashSet<>();
public static AminoAcidAlphabet getAminoAcidAlphabet() {
if (instance == null) {
instance = new AminoAcidAlphabet();
}
return instance;
}
private AminoAcidAlphabet() {
String aminoAcids = "ACDEF" +
"GHIKL" +
"MNPQR" +
"STVWY";
for (char c : aminoAcids.toCharArray()) {
alphabet.add(c);
}
}
public Set<Character> getCharacterSet() {
return Collections.<Character>unmodifiableSet(alphabet);
}
}
Alignment.java
package net.coderodde.bio.msa;
public class Alignment {
private final String alignment;
private final int cost;
Alignment(String alignment, int cost) {
this.alignment = alignment;
this.cost = cost;
}
public String getAlignemnt() {
return alignment;
}
public int getCost() {
return cost;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
String separator = "";
for (String row : alignment) {
sb.append(separator).append(row);
separator = "n";
}
sb.append("nCost: ").append(cost);
return sb.toString();
}
}
CostMatrix.java
package net.coderodde.bio.msa;
public interface CostMatrix<C> {
public C getCost(Character aminoAcidChar1, Character aminoAcidChar2);
}
HeuristicFunction.java
package net.coderodde.bio.msa;
import java.awt.Point;
import java.util.HashMap;
import java.util.Map;
final class HeuristicFunction {
private final Map<Point, Integer> matrix;
HeuristicFunction(MultipleSequenceAlignmentInstance instance) {
int sequences = instance.getSequenceArray().length;
this.matrix = new Map[sequences];
for (int i = 0; i < sequences; ++i) {
this.matrix[i] = new Map[sequences];
for (int j = i + 1; j < sequences; ++j) {
this.matrix[i][j] = new HashMap<>();
}
}
}
void put(int dimension1, int dimension2, Point point, Integer cost) {
matrix[dimension1][dimension2].put(point, cost);
}
boolean containsPartial(int dimension1, int dimension2, Point point) {
return matrix[dimension1][dimension2].containsKey(point);
}
Integer getPartial(int dimension1, int dimension2, Point point) {
return matrix[dimension1][dimension2].get(point);
}
Integer get(LatticeNode node) {
int cost = 0;
int coordinates = node.getCoordinates();
Point point = new Point();
for (int dimension1 = 0;
dimension1 < coordinates.length;
dimension1++) {
point.x = dimension1;
for (int dimension2 = dimension1 + 1;
dimension2 < coordinates.length;
dimension2++) {
point.y = dimension2;
cost += matrix[dimension1][dimension2].get(point);
}
}
return cost;
}
}
App.java
package net.coderodde.bio.msa;
final class App {
private static final String SEQUENCES = {
"ACGHKGMNPFQEKKFKLMNRW",
"CFGPQWYRTLMEKKFKNR",
"EACLMNRPQWTR",
"TIMWAYHTMGIEKKFK"
};
public static void main(String args) {
MultipleSequenceAlignmentInstance instance =
new MultipleSequenceAlignmentInstance(
PAM250CostMatrix.getPAM250CostMatrix(),
4,
SEQUENCES);
long start = System.currentTimeMillis();
Alignment alignment1 = instance.align();
long end = System.currentTimeMillis();
System.out.println(alignment1);
System.out.println(end - start + " ms.");
System.out.println();
start = System.currentTimeMillis();
Alignment alignment2 = instance.alignBrute();
end = System.currentTimeMillis();
System.out.println(alignment2);
System.out.println(end - start + " ms.");
}
}
Output
The output for the demo is
Computed heuristic function in 1003 milliseconds.
-ACG-HKGMNPF-QEKKFKLMNRW
-CFGPQW-YRTL-MEKKFK--NR-
EAC--LM-NRP--Q-WT-R-----
-TI--MWAYHTMGIEKKFK-----
Cost: 53
1126 ms.
-ACG-HKGMNPF-QEKKFKLMNRW
-CFGPQW-YRTL-MEKKFK--NR-
EAC--LM-NRP--Q-WT-R-----
-TI--MWAYHTMGIEKKFK-----
Cost: 53
60 ms.
Critique request
As always, please tell me anything that comes to mind.
java algorithm bioinformatics
java algorithm bioinformatics
asked Apr 16 '17 at 15:05
coderoddecoderodde
15.7k537125
15.7k537125
add a comment |
add a comment |
1 Answer
1
active
oldest
votes
In AminoAcidAlphabet
:
getCharacterSet
always return same set, but you always create new instance.
In PAM250CostMatrix
- in constructor you can extract method to write
add('A', 'A', -2)
instead ofm.get('A').put('A', -2)
In LatticeNode
- you can make
hashCode
simply doreturn Arrays.hashCode(this.coordinates);
add a comment |
Your Answer
StackExchange.ifUsing("editor", function () {
return StackExchange.using("mathjaxEditing", function () {
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
});
});
}, "mathjax-editing");
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "196"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: false,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f160915%2fmultiple-sequence-alignment-in-java%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
In AminoAcidAlphabet
:
getCharacterSet
always return same set, but you always create new instance.
In PAM250CostMatrix
- in constructor you can extract method to write
add('A', 'A', -2)
instead ofm.get('A').put('A', -2)
In LatticeNode
- you can make
hashCode
simply doreturn Arrays.hashCode(this.coordinates);
add a comment |
In AminoAcidAlphabet
:
getCharacterSet
always return same set, but you always create new instance.
In PAM250CostMatrix
- in constructor you can extract method to write
add('A', 'A', -2)
instead ofm.get('A').put('A', -2)
In LatticeNode
- you can make
hashCode
simply doreturn Arrays.hashCode(this.coordinates);
add a comment |
In AminoAcidAlphabet
:
getCharacterSet
always return same set, but you always create new instance.
In PAM250CostMatrix
- in constructor you can extract method to write
add('A', 'A', -2)
instead ofm.get('A').put('A', -2)
In LatticeNode
- you can make
hashCode
simply doreturn Arrays.hashCode(this.coordinates);
In AminoAcidAlphabet
:
getCharacterSet
always return same set, but you always create new instance.
In PAM250CostMatrix
- in constructor you can extract method to write
add('A', 'A', -2)
instead ofm.get('A').put('A', -2)
In LatticeNode
- you can make
hashCode
simply doreturn Arrays.hashCode(this.coordinates);
answered 17 hours ago
talextalex
1764
1764
add a comment |
add a comment |
Thanks for contributing an answer to Code Review Stack Exchange!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
Use MathJax to format equations. MathJax reference.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f160915%2fmultiple-sequence-alignment-in-java%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown