aboutsummaryrefslogtreecommitdiff
path: root/google/genomics/v1/readalignment.proto
diff options
context:
space:
mode:
Diffstat (limited to 'google/genomics/v1/readalignment.proto')
-rw-r--r--google/genomics/v1/readalignment.proto220
1 files changed, 220 insertions, 0 deletions
diff --git a/google/genomics/v1/readalignment.proto b/google/genomics/v1/readalignment.proto
new file mode 100644
index 000000000..1eb464034
--- /dev/null
+++ b/google/genomics/v1/readalignment.proto
@@ -0,0 +1,220 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto3";
+
+package google.genomics.v1;
+
+import "google/api/annotations.proto";
+import "google/genomics/v1/cigar.proto";
+import "google/genomics/v1/position.proto";
+import "google/protobuf/struct.proto";
+
+option cc_enable_arenas = true;
+option go_package = "google.golang.org/genproto/googleapis/genomics/v1;genomics";
+option java_multiple_files = true;
+option java_outer_classname = "ReadAlignmentProto";
+option java_package = "com.google.genomics.v1";
+
+// A linear alignment can be represented by one CIGAR string. Describes the
+// mapped position and local alignment of the read to the reference.
+message LinearAlignment {
+ // The position of this alignment.
+ Position position = 1;
+
+ // The mapping quality of this alignment. Represents how likely
+ // the read maps to this position as opposed to other locations.
+ //
+ // Specifically, this is `-10 * log10(Pr(mapping position is wrong))`,
+ // rounded to the nearest integer.
+ int32 mapping_quality = 2;
+
+ // Represents the local alignment of this sequence (alignment matches, indels,
+ // etc.) against the reference.
+ repeated CigarUnit cigar = 3;
+}
+
+// A read alignment describes a linear alignment of a string of DNA to a
+// [reference sequence][google.genomics.v1.Reference], in addition to metadata
+// about the fragment (the molecule of DNA sequenced) and the read (the bases
+// which were read by the sequencer). A read is equivalent to a line in a SAM
+// file. A read belongs to exactly one read group and exactly one
+// [read group set][google.genomics.v1.ReadGroupSet].
+//
+// For more genomics resource definitions, see [Fundamentals of Google
+// Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics).
+//
+// ### Reverse-stranded reads
+//
+// Mapped reads (reads having a non-null `alignment`) can be aligned to either
+// the forward or the reverse strand of their associated reference. Strandedness
+// of a mapped read is encoded by `alignment.position.reverseStrand`.
+//
+// If we consider the reference to be a forward-stranded coordinate space of
+// `[0, reference.length)` with `0` as the left-most position and
+// `reference.length` as the right-most position, reads are always aligned left
+// to right. That is, `alignment.position.position` always refers to the
+// left-most reference coordinate and `alignment.cigar` describes the alignment
+// of this read to the reference from left to right. All per-base fields such as
+// `alignedSequence` and `alignedQuality` share this same left-to-right
+// orientation; this is true of reads which are aligned to either strand. For
+// reverse-stranded reads, this means that `alignedSequence` is the reverse
+// complement of the bases that were originally reported by the sequencing
+// machine.
+//
+// ### Generating a reference-aligned sequence string
+//
+// When interacting with mapped reads, it's often useful to produce a string
+// representing the local alignment of the read to the reference. The following
+// pseudocode demonstrates one way of doing this:
+//
+//     out = ""
+//     offset = 0
+//     for c in read.alignment.cigar {
+//       switch c.operation {
+//       case "ALIGNMENT_MATCH", "SEQUENCE_MATCH", "SEQUENCE_MISMATCH":
+//         out += read.alignedSequence[offset:offset+c.operationLength]
+//         offset += c.operationLength
+//         break
+//       case "CLIP_SOFT", "INSERT":
+//         offset += c.operationLength
+//         break
+//       case "PAD":
+//         out += repeat("*", c.operationLength)
+//         break
+//       case "DELETE":
+//         out += repeat("-", c.operationLength)
+//         break
+//       case "SKIP":
+//         out += repeat(" ", c.operationLength)
+//         break
+//       case "CLIP_HARD":
+//         break
+//       }
+//     }
+//     return out
+//
+// ### Converting to SAM's CIGAR string
+//
+// The following pseudocode generates a SAM CIGAR string from the
+// `cigar` field. Note that this is a lossy conversion
+// (`cigar.referenceSequence` is lost).
+//
+//     cigarMap = {
+//       "ALIGNMENT_MATCH": "M",
+//       "INSERT": "I",
+//       "DELETE": "D",
+//       "SKIP": "N",
+//       "CLIP_SOFT": "S",
+//       "CLIP_HARD": "H",
+//       "PAD": "P",
+//       "SEQUENCE_MATCH": "=",
+//       "SEQUENCE_MISMATCH": "X",
+//     }
+//     cigarStr = ""
+//     for c in read.alignment.cigar {
+//       cigarStr += c.operationLength + cigarMap[c.operation]
+//     }
+//     return cigarStr
+message Read {
+ // The server-generated read ID, unique across all reads. This is different
+ // from the `fragmentName`.
+ string id = 1;
+
+ // The ID of the read group this read belongs to. A read belongs to exactly
+ // one read group. This is a server-generated ID which is distinct from SAM's
+ // RG tag (for that value, see
+ // [ReadGroup.name][google.genomics.v1.ReadGroup.name]).
+ string read_group_id = 2;
+
+ // The ID of the read group set this read belongs to. A read belongs to
+ // exactly one read group set.
+ string read_group_set_id = 3;
+
+ // The fragment name. Equivalent to QNAME (query template name) in SAM.
+ string fragment_name = 4;
+
+ // Whether the orientation and the distance between reads from the fragment
+ // are consistent with the sequencing protocol (SAM flag 0x2).
+ bool proper_placement = 5;
+
+ // Whether the fragment is a PCR or optical duplicate (SAM flag 0x400).
+ bool duplicate_fragment = 6;
+
+ // The observed length of the fragment, equivalent to TLEN in SAM.
+ int32 fragment_length = 7;
+
+ // The read number in sequencing. 0-based and less than `numberReads`. This
+ // field replaces SAM flags 0x40 and 0x80.
+ int32 read_number = 8;
+
+ // The number of reads in the fragment (extension to SAM flag 0x1).
+ int32 number_reads = 9;
+
+ // Whether this read did not pass filters, such as platform or vendor quality
+ // controls (SAM flag 0x200).
+ bool failed_vendor_quality_checks = 10;
+
+ // The linear alignment for this alignment record. This field is null for
+ // unmapped reads.
+ LinearAlignment alignment = 11;
+
+ // Whether this alignment is secondary. Equivalent to SAM flag 0x100.
+ // A secondary alignment represents an alternative to the primary alignment
+ // for this read. Aligners may return secondary alignments if a read can map
+ // ambiguously to multiple coordinates in the genome. By convention, each read
+ // has one and only one alignment where both `secondaryAlignment`
+ // and `supplementaryAlignment` are false.
+ bool secondary_alignment = 12;
+
+ // Whether this alignment is supplementary. Equivalent to SAM flag 0x800.
+ // Supplementary alignments are used in the representation of a chimeric
+ // alignment. In a chimeric alignment, a read is split into multiple
+ // linear alignments that map to different reference contigs. The first
+ // linear alignment in the read will be designated as the representative
+ // alignment; the remaining linear alignments will be designated as
+ // supplementary alignments. These alignments may have different mapping
+ // quality scores. In each linear alignment in a chimeric alignment, the read
+ // will be hard clipped. The `alignedSequence` and
+ // `alignedQuality` fields in the alignment record will only
+ // represent the bases for its respective linear alignment.
+ bool supplementary_alignment = 13;
+
+ // The bases of the read sequence contained in this alignment record,
+ // **without CIGAR operations applied** (equivalent to SEQ in SAM).
+ // `alignedSequence` and `alignedQuality` may be
+ // shorter than the full read sequence and quality. This will occur if the
+ // alignment is part of a chimeric alignment, or if the read was trimmed. When
+ // this occurs, the CIGAR for this read will begin/end with a hard clip
+ // operator that will indicate the length of the excised sequence.
+ string aligned_sequence = 14;
+
+ // The quality of the read sequence contained in this alignment record
+ // (equivalent to QUAL in SAM).
+ // `alignedSequence` and `alignedQuality` may be shorter than the full read
+ // sequence and quality. This will occur if the alignment is part of a
+ // chimeric alignment, or if the read was trimmed. When this occurs, the CIGAR
+ // for this read will begin/end with a hard clip operator that will indicate
+ // the length of the excised sequence.
+ repeated int32 aligned_quality = 15;
+
+ // The mapping of the primary alignment of the
+ // `(readNumber+1)%numberReads` read in the fragment. It replaces
+ // mate position and mate strand in SAM.
+ Position next_mate_position = 16;
+
+ // A map of additional read alignment information. This must be of the form
+ // `map<string, string[]>` (string key mapping to a list of string values).
+ map<string, google.protobuf.ListValue> info = 17;
+}