diff options
Diffstat (limited to 'google/genomics/v1/readalignment.proto')
-rw-r--r-- | google/genomics/v1/readalignment.proto | 220 |
1 files changed, 220 insertions, 0 deletions
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.genomics.v1;

import "google/api/annotations.proto";
import "google/genomics/v1/cigar.proto";
import "google/genomics/v1/position.proto";
import "google/protobuf/struct.proto";

option cc_enable_arenas = true;
option go_package = "google.golang.org/genproto/googleapis/genomics/v1;genomics";
option java_multiple_files = true;
option java_outer_classname = "ReadAlignmentProto";
option java_package = "com.google.genomics.v1";

// An alignment that a single CIGAR string is able to represent. It records
// where the read maps on the reference and how it aligns locally there.
message LinearAlignment {
  // Where this alignment is positioned.
  Position position = 1;

  // Mapping quality of this alignment: how likely it is that the read maps to
  // this position rather than to some other location.
  //
  // The value is -10 log10 Pr(mapping position is wrong), rounded to the
  // nearest integer.
  int32 mapping_quality = 2;

  // The local alignment of this sequence against the reference (alignment
  // matches, indels, etc), as a list of CIGAR units.
  repeated CigarUnit cigar = 3;
}

// A read alignment: the linear alignment of a string of DNA to a
// [reference sequence][google.genomics.v1.Reference], together with metadata
// describing the fragment (the molecule of DNA that was sequenced) and the
// read (the bases the sequencer read). One read is equivalent to one line in
// a SAM file. Every read belongs to exactly one read group and exactly one
// [read group set][google.genomics.v1.ReadGroupSet].
//
// For more genomics resource definitions, see [Fundamentals of Google
// Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics).
//
// ### Reverse-stranded reads
//
// A mapped read (a read whose `alignment` is non-null) may be aligned to
// either the forward or the reverse strand of its associated reference. The
// strand is encoded by `alignment.position.reverseStrand`.
//
// Treat the reference as a forward-stranded coordinate space
// `[0, reference.length)`, in which `0` is the left-most position and
// `reference.length` is the right-most position. Reads are always aligned
// left to right in that space: `alignment.position.position` always refers to
// the left-most reference coordinate, and `alignment.cigar` describes the
// alignment of the read to the reference from left to right. Every per-base
// field, such as `alignedSequence` and `alignedQuality`, shares this
// left-to-right orientation, whichever strand the read is aligned to. For a
// reverse-stranded read this means that `alignedSequence` is the reverse
// complement of the bases originally reported by the sequencing machine.
//
// ### Generating a reference-aligned sequence string
//
// When working with mapped reads it is often useful to build a string that
// represents the local alignment of the read to the reference. One way of
// doing so is shown in the following pseudocode:
//
//     out = ""
//     offset = 0
//     for c in read.alignment.cigar {
//       switch c.operation {
//       case "ALIGNMENT_MATCH", "SEQUENCE_MATCH", "SEQUENCE_MISMATCH":
//         out += read.alignedSequence[offset:offset+c.operationLength]
//         offset += c.operationLength
//         break
//       case "CLIP_SOFT", "INSERT":
//         offset += c.operationLength
//         break
//       case "PAD":
//         out += repeat("*", c.operationLength)
//         break
//       case "DELETE":
//         out += repeat("-", c.operationLength)
//         break
//       case "SKIP":
//         out += repeat(" ", c.operationLength)
//         break
//       case "CLIP_HARD":
//         break
//       }
//     }
//     return out
//
// ### Converting to SAM's CIGAR string
//
// A SAM CIGAR string can be produced from the `cigar` field with the
// pseudocode below. The conversion is lossy: `cigar.referenceSequence` is not
// carried over.
//
//     cigarMap = {
//       "ALIGNMENT_MATCH": "M",
//       "INSERT": "I",
//       "DELETE": "D",
//       "SKIP": "N",
//       "CLIP_SOFT": "S",
//       "CLIP_HARD": "H",
//       "PAD": "P",
//       "SEQUENCE_MATCH": "=",
//       "SEQUENCE_MISMATCH": "X",
//     }
//     cigarStr = ""
//     for c in read.alignment.cigar {
//       cigarStr += c.operationLength + cigarMap[c.operation]
//     }
//     return cigarStr
message Read {
  // Server-generated ID of the read, unique across all reads. Not the same
  // thing as the `fragmentName`.
  string id = 1;

  // ID of the read group that this read belongs to; a read belongs to exactly
  // one read group. The ID is server-generated and is distinct from SAM's RG
  // tag (for that value, see
  // [ReadGroup.name][google.genomics.v1.ReadGroup.name]).
  string read_group_id = 2;

  // ID of the read group set that this read belongs to; a read belongs to
  // exactly one read group set.
  string read_group_set_id = 3;

  // Name of the fragment. Equivalent to QNAME (query template name) in SAM.
  string fragment_name = 4;

  // Set when the orientation of, and the distance between, the reads from the
  // fragment are consistent with the sequencing protocol (SAM flag 0x2).
  bool proper_placement = 5;

  // Set when the fragment is a PCR or optical duplicate (SAM flag 0x400).
  bool duplicate_fragment = 6;

  // Observed length of the fragment. Equivalent to TLEN in SAM.
  int32 fragment_length = 7;

  // Number of this read in sequencing. It is 0-based and less than
  // numberReads. Takes the place of SAM flags 0x40 and 0x80.
  int32 read_number = 8;

  // How many reads the fragment contains (extension to SAM flag 0x1).
  int32 number_reads = 9;

  // Set when this read did not pass filters, such as platform or vendor
  // quality controls (SAM flag 0x200).
  bool failed_vendor_quality_checks = 10;

  // Linear alignment for this alignment record. Null for unmapped reads.
  LinearAlignment alignment = 11;

  // Set when this alignment is secondary. Equivalent to SAM flag 0x100.
  // A secondary alignment is an alternative to the primary alignment for this
  // read. An aligner may return secondary alignments when a read can map
  // ambiguously to multiple coordinates in the genome. By convention, each
  // read has one and only one alignment for which both `secondaryAlignment`
  // and `supplementaryAlignment` are false.
  bool secondary_alignment = 12;

  // Set when this alignment is supplementary. Equivalent to SAM flag 0x800.
  // Supplementary alignments are used to represent a chimeric alignment, in
  // which a read is split into multiple linear alignments that map to
  // different reference contigs. The first linear alignment in the read is
  // designated the representative alignment, and the remaining linear
  // alignments are designated supplementary alignments. Their mapping quality
  // scores may differ. In every linear alignment of a chimeric alignment the
  // read is hard clipped, and the `alignedSequence` and `alignedQuality`
  // fields in the alignment record represent only the bases for its
  // respective linear alignment.
  bool supplementary_alignment = 13;

  // Bases of the read sequence contained in this alignment record,
  // **without CIGAR operations applied** (equivalent to SEQ in SAM).
  // `alignedSequence` and `alignedQuality` may be shorter than the full read
  // sequence and quality. That happens when the alignment is part of a
  // chimeric alignment, or when the read was trimmed. In that case the CIGAR
  // for this read begins/ends with a hard clip operator indicating the length
  // of the excised sequence.
  string aligned_sequence = 14;

  // Quality of the read sequence contained in this alignment record
  // (equivalent to QUAL in SAM).
  // `alignedSequence` and `alignedQuality` may be shorter than the full read
  // sequence and quality. That happens when the alignment is part of a
  // chimeric alignment, or when the read was trimmed. In that case the CIGAR
  // for this read begins/ends with a hard clip operator indicating the length
  // of the excised sequence.
  repeated int32 aligned_quality = 15;

  // Mapping of the primary alignment of the `(readNumber+1)%numberReads` read
  // in the fragment. Takes the place of mate position and mate strand in SAM.
  Position next_mate_position = 16;

  // Additional read alignment information, as a map. It must be of the form
  // map<string, string[]> (string key mapping to a list of string values).
  map<string, google.protobuf.ListValue> info = 17;
}