|
| 1 | +namespace BioFSharp.Stats |
| 2 | + |
| 3 | +open System |
| 4 | +open FSharpAux |
| 5 | + |
| 6 | +module OntologyEnrichment = |
| 7 | + |
/// Represents a single item of an ontology set together with its annotation.
type OntologyItem<'a> =
    {
        /// Identifier of the item
        Id           : string
        /// Ontology term the item is annotated with
        OntologyTerm : string
        /// Index of the group the item is assigned to
        GroupIndex   : int
        /// The annotated item itself
        Item         : 'a
    }
| 15 | + |
| 16 | + |
/// Builds an OntologyItem from its id, ontology term, group index and item.
let createOntologyItem id ontologyTerm groupIndex item =
    {
        Id           = id
        OntologyTerm = ontologyTerm
        GroupIndex   = groupIndex
        Item         = item
    }
| 20 | + |
/// Result of a gene set enrichment test for one ontology term (bin).
type GseaResult<'a> =
    {
        /// Ontology term, e.g. a MapMan term or GO term
        OntologyTerm     : string
        /// The individual items associated with the ontology term
        ItemsInBin       : seq<OntologyItem<'a>>
        /// Number of significantly altered items in the 'OntologyTerm' bin
        NumberOfDEsInBin : int
        /// Number of items in the 'OntologyTerm' bin
        NumberInBin      : int
        /// Number of significantly altered items within the total data set
        TotalNumberOfDE  : int
        /// Number of all items (expanded)
        TotalUniverse    : int
        /// P value calculated for the bin
        PValue           : float
    }
| 37 | + |
/// Builds a GseaResult; the arguments are stored in the record fields in the order given.
let createGseaResult ontologyTerm desInBin numberOfDEsInBin numberInBin totalNumberOfDE totalUnivers pValue =
    {
        OntologyTerm     = ontologyTerm
        ItemsInBin       = desInBin
        NumberOfDEsInBin = numberOfDEsInBin
        NumberInBin      = numberInBin
        TotalNumberOfDE  = totalNumberOfDE
        TotalUniverse    = totalUnivers
        PValue           = pValue
    }
| 42 | + |
/// Splits an OntologyItem whose OntologyTerm holds several term ids joined by `separator`
/// into one item per term id. All other fields are copied from the input item.
let splitMultipleAnnotationsBy (separator:char) (item:OntologyItem<'A>) =
    let withTerm term = { item with OntologyTerm = term }
    item.OntologyTerm.Split(separator)
    |> Seq.map withTerm
| 48 | + |
/// Splits MapMan OntologyItems whose OntologyTerm holds several term ids joined by `separator`
/// into one item per term id.
/// Attention: every '.'-separated level of a term id is parsed as Int32 and written back,
/// so levels that are not integers make Int32.Parse throw.
let splitMapManOntologyItemsBy (separator:char) (data:seq<OntologyItem<'a>>) =
    // Round-trips each level of one term id through Int32 and re-joins the levels with '.'
    let normalizeTerm (term:string) =
        term.Split('.')
        |> Seq.map (fun level -> Int32.Parse(level).ToString())
        |> String.concat "."
    // All term ids contained in one (possibly concatenated) annotation string
    let normalizedTerms (termIds:string) =
        termIds.Split(separator)
        |> Array.map normalizeTerm
    data
    |> Seq.collect (fun entry ->
        normalizedTerms entry.OntologyTerm
        |> Seq.map (fun term -> createOntologyItem entry.Id term entry.GroupIndex entry.Item))
| 64 | + |
| 65 | + |
| 66 | + /// Extends leaf OntologyEntries to their full tree |
| 67 | + let expandOntologyTree (data:seq<OntologyItem<'a>>) = |
| 68 | + data |
| 69 | + |> Seq.collect (fun oi -> |
| 70 | + let expandenTermIds = oi.OntologyTerm.Split('.') |> Array.scanReduce (fun acc elem -> acc + "." + elem) |
| 71 | + expandenTermIds |> Seq.map (fun sTerm -> createOntologyItem oi.Id sTerm oi.GroupIndex oi.Item) |
| 72 | + ) |
| 73 | + |
| 74 | + |
// ###########################################################################################################
// The hypergeometric distribution is a discrete probability distribution that describes the probability of
//    k successes in
//    n draws without replacement from a finite
//    population of size N containing
//    K success states.
/// Calculates the p value of a bin based on the hypergeometric distribution as the upper tail
/// 1 - CDF(numberOfDEsInBin - 1).
/// Bins with numberInBin <= splitPvalueThreshold get the split p value instead: the mean of
/// 1 - CDF(numberOfDEsInBin - 1) and 1 - CDF(numberOfDEsInBin).
/// Returns nan if the bin contains fewer than two DE items.
let CalcHyperGeoPvalue numberOfDEsInBin numberInBin totalUnivers totalNumberOfDE (splitPvalueThreshold:int) =
    if numberOfDEsInBin <= 1 then
        nan
    else
        // NOTE(review): Init is presumably (population size, successes in population, draws) - confirm in FSharp.Stats
        let hp = FSharp.Stats.Distributions.Discrete.Hypergeometric.Init totalUnivers totalNumberOfDE numberInBin
        let upperTail = 1. - hp.CDF(float (numberOfDEsInBin - 1))
        if numberInBin > splitPvalueThreshold then
            // Normal p value
            upperTail
        else
            // Split p value
            0.5 * (upperTail + (1. - hp.CDF(float numberOfDEsInBin)))
| 93 | + |
| 94 | + |
// #######################################################
// Functional term enrichment is calculated according to the following publication:
// http://bioinformatics.oxfordjournals.org/cgi/content/abstract/23/4/401
// Also includes mid-pValues.
/// Calculates functional term enrichment over all items in `data`.
///
/// deGroupIndex         - GroupIndex that marks an item as significantly altered (DE)
/// splitPvalueThreshold - bins with at most this many items get the split p value (default 5)
/// data                 - the annotated items; the universe is the number of items in `data`
///
/// Returns one GseaResult per distinct OntologyTerm. The input is read exactly once.
let CalcSimpleOverEnrichment (deGroupIndex:int) (splitPvalueThreshold:option<int>) (data:seq<OntologyItem<'a>>) =
    let _splitPvalueThreshold = defaultArg splitPvalueThreshold 5

    // Materialize once: the items are needed for the universe size, the DE count and the
    // grouping, and a lazy input would otherwise be re-evaluated for each of them.
    let items = Array.ofSeq data

    let isDE (oi:OntologyItem<'a>) = oi.GroupIndex = deGroupIndex
    let countDEs (subSet:seq<OntologyItem<'a>>) = subSet |> Seq.filter isDE |> Seq.length

    let totalUnivers    = items.Length
    let totalNumberOfDE = countDEs items

    items
    |> Seq.groupBy (fun oi -> oi.OntologyTerm)
    |> Seq.map (fun (oTerm,values) ->
        let numberOfDEsInBin = countDEs values
        let numberInBin      = Seq.length values
        let pValue = CalcHyperGeoPvalue numberOfDEsInBin numberInBin totalUnivers totalNumberOfDE _splitPvalueThreshold
        createGseaResult oTerm values numberOfDEsInBin numberInBin totalNumberOfDE totalUnivers pValue)
| 120 | + |
| 121 | + |
| 122 | + // ####################################################### |
| 123 | + // functional term enrichment is calculated according to following publication |
| 124 | + // http://bioinformatics.oxfordjournals.org/cgi/content/abstract/23/4/401 |
| 125 | + // also includes mid-pValues |
| 126 | + /// Calculates functional term enrichment |
| 127 | + let CalcOverEnrichment (deGroupIndex:int) (splitPvalueThreshold:option<int>) (minNumberInTerm:option<int>) (data:seq<OntologyItem<'a>>) = |
| 128 | + let _splitPvalueThreshold = defaultArg splitPvalueThreshold 5 |
| 129 | + let _minNumberInTerm = defaultArg minNumberInTerm 2 |
| 130 | + |
| 131 | + // Distinct by term and gene name |
| 132 | + // Has to be done by an ouside function |
| 133 | + //let distinctData = data |> Seq.distinctBy (fun o -> o.displayID) |
| 134 | + let gData = data |> Seq.groupBy ( fun o -> o.OntologyTerm) |
| 135 | + // reduce to terms at least annotated with 2 items |
| 136 | + let fData = gData |> Seq.filter ( fun (key:string,values:seq<OntologyItem<'a>>) -> Seq.length(values) >= _minNumberInTerm) |
| 137 | + let groupCount = fData |> Seq.collect (fun (key:string,values:seq<OntologyItem<'a>>) -> values ) |> Seq.countBy (fun o -> o.GroupIndex) |
| 138 | + |
| 139 | + let totalUnivers = groupCount |> Seq.fold (fun (acc:int) (index:int,count:int) -> acc + count) 0 |
| 140 | + let totalNumberOfDE = |
| 141 | + let tmp = groupCount |> Seq.tryFind (fun (key,v) -> key = deGroupIndex) |
| 142 | + if tmp.IsNone then |
| 143 | + raise (System.ArgumentException("DE group index does not exists in ontology entry")) |
| 144 | + else |
| 145 | + snd(tmp.Value) |
| 146 | + |
| 147 | + // returns (DE count, all count) |
| 148 | + let countDE (subSet:seq<OntologyItem<'a>>) = |
| 149 | + let countMap = |
| 150 | + subSet |
| 151 | + |> Seq.countBy (fun (oi) -> oi.GroupIndex = deGroupIndex) |
| 152 | + |> Map.ofSeq |
| 153 | + (countMap.TryFindDefault 0 true,(countMap.TryFindDefault 0 true) + (countMap.TryFindDefault 0 false)) |
| 154 | + |
| 155 | + fData |
| 156 | + |> Seq.map (fun (oTerm,values) -> |
| 157 | + let numberOfDEsInBin,numberInBin = countDE values |
| 158 | + let pValue = CalcHyperGeoPvalue numberOfDEsInBin numberInBin totalUnivers totalNumberOfDE _splitPvalueThreshold |
| 159 | + createGseaResult oTerm values numberOfDEsInBin numberInBin totalNumberOfDE totalUnivers pValue) |
| 160 | + |
| 161 | + |
0 commit comments