Skip to content

Commit a03c83f

Browse files
committed
Port existing modules
1 parent 715f9ea commit a03c83f

File tree

9 files changed

+583
-21
lines changed

9 files changed

+583
-21
lines changed

src/BioFSharp.Stats/BioFSharp.Stats.fsproj

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,15 @@
66
</PropertyGroup>
77

88
<ItemGroup>
9-
<Compile Include="Library.fs" />
9+
<Compile Include="PhylTree.fs" />
10+
<Compile Include="OntologyEnrichment.fs" />
11+
<Compile Include="RNASeq.fs" />
12+
<Compile Include="SAEPT.fs" />
1013
</ItemGroup>
1114

1215
<ItemGroup>
1316
<PackageReference Include="BioFSharp" Version="2.0.0-preview.3" />
17+
<PackageReference Include="FSharp.Stats" Version="0.4.11" />
1418
</ItemGroup>
1519

1620
</Project>

src/BioFSharp.Stats/Library.fs

Lines changed: 0 additions & 8 deletions
This file was deleted.
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
namespace BioFSharp.Stats
2+
3+
open System
4+
open FSharpAux
5+
6+
module OntologyEnrichment =
7+
8+
/// Represents an item in an ontology set
9+
type OntologyItem<'a> = {
10+
Id : string
11+
OntologyTerm : string
12+
GroupIndex : int
13+
Item : 'a
14+
}
15+
16+
17+
/// Creates an item in an ontology set
18+
let createOntologyItem id ontologyTerm groupIndex item =
19+
{Id = id; OntologyTerm = ontologyTerm; GroupIndex = groupIndex; Item = item}
20+
21+
/// Represents a gene set enrichment result
22+
type GseaResult<'a> = {
23+
///Ontology term e.g. MapMan term, GO term ...
24+
OntologyTerm : string
25+
///Sequence of single items associated to the ontology term
26+
ItemsInBin : seq<OntologyItem<'a>>
27+
///Number of significantly altered items in 'OntologyTerm' bin
28+
NumberOfDEsInBin : int
29+
///Number of items in 'OntologyTerm' bin
30+
NumberInBin : int
31+
///Number of significantly altered items within the total data set
32+
TotalNumberOfDE : int
33+
///Number of all items (expanded)
34+
TotalUniverse : int
35+
PValue : float
36+
}
37+
38+
/// Creates a gene set enrichment result
39+
let createGseaResult ontologyTerm desInBin numberOfDEsInBin numberInBin totalNumberOfDE totalUnivers pValue =
40+
{OntologyTerm = ontologyTerm;ItemsInBin = desInBin; NumberOfDEsInBin = numberOfDEsInBin;
41+
NumberInBin = numberInBin; TotalNumberOfDE = totalNumberOfDE; TotalUniverse = totalUnivers; PValue = pValue}
42+
43+
///Splits an OntologyEntry with seperator concatenated TermIds
44+
let splitMultipleAnnotationsBy (separator:char) (item:OntologyItem<'A>) =
45+
let annotations = item.OntologyTerm.Split(separator)
46+
annotations
47+
|> Seq.map (fun ot -> {item with OntologyTerm = ot})
48+
49+
/// Splits MapMan OntologyEntries with seperator concatenated TermIds
50+
/// Attention: Also parses string to int to get rid of 0 - terms
51+
let splitMapManOntologyItemsBy (separator:char) (data:seq<OntologyItem<'a>>) =
52+
let splitTerm (termId:string) (separator:char) =
53+
termId.Split(separator)
54+
|> Array.map (fun sTerm ->
55+
let splited = sTerm.Split('.')
56+
let toInt = splited |> Seq.map (fun v -> Int32.Parse(v).ToString())
57+
toInt |> String.concat "."
58+
)
59+
data
60+
|> Seq.collect (fun oi ->
61+
splitTerm oi.OntologyTerm separator
62+
|> Seq.map (fun sTerm -> createOntologyItem oi.Id sTerm oi.GroupIndex oi.Item)
63+
)
64+
65+
66+
/// Extends leaf OntologyEntries to their full tree
67+
let expandOntologyTree (data:seq<OntologyItem<'a>>) =
68+
data
69+
|> Seq.collect (fun oi ->
70+
let expandenTermIds = oi.OntologyTerm.Split('.') |> Array.scanReduce (fun acc elem -> acc + "." + elem)
71+
expandenTermIds |> Seq.map (fun sTerm -> createOntologyItem oi.Id sTerm oi.GroupIndex oi.Item)
72+
)
73+
74+
75+
// ###########################################################################################################
76+
// the hypergeometric distribution is a discrete probability distribution that describes the probability of
77+
// k successes in
78+
// n draws from a finite
79+
// x population of size containing
80+
// m successes without replacement (successes states)
81+
/// Calculates p value based on hypergeometric distribution (pValue <= k)
82+
let CalcHyperGeoPvalue numberOfDEsInBin numberInBin totalUnivers totalNumberOfDE (splitPvalueThreshold:int) =
83+
if (numberOfDEsInBin > 1) then
84+
let hp = FSharp.Stats.Distributions.Discrete.Hypergeometric.Init totalUnivers totalNumberOfDE numberInBin
85+
if numberInBin > splitPvalueThreshold then
86+
// Calculate normal pValue
87+
1. - hp.CDF (float (numberOfDEsInBin - 1))
88+
else
89+
// Calculate split pValue
90+
0.5 * ((1. - hp.CDF(float(numberOfDEsInBin - 1)) ) + ( (1. - hp.CDF(float(numberOfDEsInBin))) ) )
91+
else
92+
nan
93+
94+
95+
// #######################################################
96+
// functional term enrichment is calculated according to following publication
97+
// http://bioinformatics.oxfordjournals.org/cgi/content/abstract/23/4/401
98+
// also includes mid-pValues
99+
/// Calculates functional term enrichment
100+
let CalcSimpleOverEnrichment (deGroupIndex:int) (splitPvalueThreshold:option<int>) (data:seq<OntologyItem<'a>>) =
101+
let _splitPvalueThreshold = defaultArg splitPvalueThreshold 5
102+
103+
let totalUnivers = data |> Seq.length
104+
let totalNumberOfDE = data |> Seq.filter (fun oi -> oi.GroupIndex = deGroupIndex) |> Seq.length
105+
106+
// returns (DE count, all count)
107+
let countDE (subSet:seq<OntologyItem<'a>>) =
108+
let countMap =
109+
subSet
110+
|> Seq.countBy (fun oi -> if oi.GroupIndex = deGroupIndex then true else false )
111+
|> Map.ofSeq
112+
(countMap.TryFindDefault 0 true,(countMap.TryFindDefault 0 true) + (countMap.TryFindDefault 0 false))
113+
114+
data
115+
|> Seq.groupBy ( fun oi -> oi.OntologyTerm)
116+
|> Seq.map (fun (oTerm,values) ->
117+
let numberOfDEsInBin,numberInBin = countDE values
118+
let pValue = CalcHyperGeoPvalue numberOfDEsInBin numberInBin totalUnivers totalNumberOfDE _splitPvalueThreshold
119+
createGseaResult oTerm values numberOfDEsInBin numberInBin totalNumberOfDE totalUnivers pValue)
120+
121+
122+
// #######################################################
123+
// functional term enrichment is calculated according to following publication
124+
// http://bioinformatics.oxfordjournals.org/cgi/content/abstract/23/4/401
125+
// also includes mid-pValues
126+
/// Calculates functional term enrichment
127+
let CalcOverEnrichment (deGroupIndex:int) (splitPvalueThreshold:option<int>) (minNumberInTerm:option<int>) (data:seq<OntologyItem<'a>>) =
128+
let _splitPvalueThreshold = defaultArg splitPvalueThreshold 5
129+
let _minNumberInTerm = defaultArg minNumberInTerm 2
130+
131+
// Distinct by term and gene name
132+
// Has to be done by an ouside function
133+
//let distinctData = data |> Seq.distinctBy (fun o -> o.displayID)
134+
let gData = data |> Seq.groupBy ( fun o -> o.OntologyTerm)
135+
// reduce to terms at least annotated with 2 items
136+
let fData = gData |> Seq.filter ( fun (key:string,values:seq<OntologyItem<'a>>) -> Seq.length(values) >= _minNumberInTerm)
137+
let groupCount = fData |> Seq.collect (fun (key:string,values:seq<OntologyItem<'a>>) -> values ) |> Seq.countBy (fun o -> o.GroupIndex)
138+
139+
let totalUnivers = groupCount |> Seq.fold (fun (acc:int) (index:int,count:int) -> acc + count) 0
140+
let totalNumberOfDE =
141+
let tmp = groupCount |> Seq.tryFind (fun (key,v) -> key = deGroupIndex)
142+
if tmp.IsNone then
143+
raise (System.ArgumentException("DE group index does not exists in ontology entry"))
144+
else
145+
snd(tmp.Value)
146+
147+
// returns (DE count, all count)
148+
let countDE (subSet:seq<OntologyItem<'a>>) =
149+
let countMap =
150+
subSet
151+
|> Seq.countBy (fun (oi) -> oi.GroupIndex = deGroupIndex)
152+
|> Map.ofSeq
153+
(countMap.TryFindDefault 0 true,(countMap.TryFindDefault 0 true) + (countMap.TryFindDefault 0 false))
154+
155+
fData
156+
|> Seq.map (fun (oTerm,values) ->
157+
let numberOfDEsInBin,numberInBin = countDE values
158+
let pValue = CalcHyperGeoPvalue numberOfDEsInBin numberInBin totalUnivers totalNumberOfDE _splitPvalueThreshold
159+
createGseaResult oTerm values numberOfDEsInBin numberInBin totalNumberOfDE totalUnivers pValue)
160+
161+

src/BioFSharp.Stats/PhylTree.fs

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
namespace BioFSharp.Stats
2+
3+
open BioFSharp
4+
open FSharp.Stats.ML.Unsupervised
5+
6+
module PhylTree =
7+
8+
/// <summary>
9+
/// converts the input hierarchical clustering to a phylogenetig tree and conserves the distance insformation.
10+
/// In contrast to the clustering result, the distance value of a Branch represents the distance to its Parent,
11+
/// not the distance that all children have to this Branch.
12+
/// </summary>
13+
let ofHierarchicalCluster (branchTag:'T) (distanceConverter: float -> 'Distance) (hCluster:HierarchicalClustering.Cluster<'T>) : PhylogeneticTree<'T * 'Distance>=
14+
let rec loop distance (c: HierarchicalClustering.Cluster<'T>) =
15+
match c with
16+
| HierarchicalClustering.Cluster.Node (cIndex, distance, lCount, left, right) ->
17+
PhylogeneticTree.Branch ((branchTag, distanceConverter distance), [loop distance left; loop distance right])
18+
| HierarchicalClustering.Cluster.Leaf (id, lCount, tag) -> PhylogeneticTree.Leaf((tag, distanceConverter distance))
19+
loop 0. hCluster
20+
21+
/// <summary>
22+
/// Performs hierarchical clustering of the input TaggedSequences using the provided distance function and linker. Returns the result as a Phylogenetic tree.</summary>
23+
/// </summary>
24+
/// <parameter name="branchTag">a tag to give the infered common ancestor branches (these are not tagged in contrast to the input sequence.)</parameter>
25+
/// <parameter name="distanceConverter">a converter function for the distance between nodes of the tree. Usually, a conversion to a string makes sense for downstream conversion to Newick format</parameter>
26+
/// <parameter name="distanceFunction">a function that determines the distance between two sequences e.g. evolutionary distance based on a substitution model</parameter>
27+
/// <parameter name="linker">the linker function to join clusters with</parameter>
28+
/// <parameter name="sequences">the input TaggedSequences</parameter>
29+
let ofTaggedSequencesWithLinker (branchTag:'T) (distanceConverter: float -> 'Distance) (distanceFunction: seq<'S> -> seq<'S> -> float) linker (sequences: seq<TaggedSequence<'T,'S>>) =
30+
sequences
31+
|> HierarchicalClustering.generate
32+
(fun a b -> distanceFunction a.Sequence b.Sequence)
33+
linker
34+
|> ofHierarchicalCluster (TaggedSequence.create branchTag Seq.empty) distanceConverter
35+
36+
37+
/// <summary>
38+
/// Performs hierarchical clustering of the input TaggedSequences using the provided distance function. Returns the result as a Phylogenetic tree.</summary>
39+
/// </summary>
40+
/// <parameter name="distanceFunction">a function that determines the distance between two sequences e.g. evolutionary distance based on a substitution model</parameter>
41+
/// <parameter name="sequences">the input TaggedSequences</parameter>
42+
let ofTaggedBioSequences (distanceFunction: seq<#IBioItem> -> seq<#IBioItem> -> float) (sequences: seq<TaggedSequence<string,#IBioItem>>) : PhylogeneticTree<TaggedSequence<string,#IBioItem>*float>=
43+
sequences
44+
|> ofTaggedSequencesWithLinker
45+
"Ancestor"
46+
id
47+
distanceFunction
48+
HierarchicalClustering.Linker.upgmaLwLinker

src/BioFSharp.Stats/RNASeq.fs

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
namespace BioFSharp.Stats
2+
3+
/// Contains types and functions needed for RNA-Seq normalization
4+
module RNASeq =
5+
6+
/// Input type for RNA-Seq normalization
7+
type RNASeqInput = {
8+
GeneID : string
9+
GeneLength : float
10+
GeneCount : float
11+
} with
12+
static member Create(
13+
id: string,
14+
gl: float,
15+
gc: float
16+
) = { GeneID=id; GeneLength=gl; GeneCount=gc}
17+
18+
type NormalizationMethod =
19+
| RPKM
20+
| TPM
21+
22+
/// Type with GeneID, normalized data and method of normalization
23+
type NormalizedCounts = {
24+
GeneID : string
25+
NormalizedCount : float
26+
NormalizationMethod: NormalizationMethod
27+
} with
28+
static member Create(
29+
id: string,
30+
nc: float,
31+
nm: NormalizationMethod
32+
) = { GeneID=id; NormalizedCount=nc; NormalizationMethod=nm}
33+
34+
/// calculates Reads Per Million
35+
let private calcRPM sumOfAllReadsPerMil counts =
36+
(counts |> float) / sumOfAllReadsPerMil
37+
38+
/// calculates RPKM
39+
let private calcRPKM geneLength rpm =
40+
(float rpm) / ((float geneLength) / 1000.)
41+
42+
///Performs RPKM normalization
43+
let private rpkmsOf (geneIDs:seq<string>) (length:seq<float>) (counts:seq<float>) =
44+
45+
let sumOfAllReads = counts |> Seq.sum
46+
let sumOfAllReadsPerMil = sumOfAllReads / 1000000.
47+
let rpms = Seq.map (fun counts -> calcRPM sumOfAllReadsPerMil counts) counts
48+
49+
let rpkms =
50+
Seq.zip length rpms
51+
|> Seq.map (fun (length, rpm) -> calcRPKM length rpm)
52+
53+
let rpkmResult =
54+
Seq.map2 (fun ids counts -> NormalizedCounts.Create(ids, counts, RPKM)) geneIDs rpkms
55+
56+
rpkmResult
57+
58+
/// Returns RPKM normalized data
59+
let rpkms (idLengthAndCounts:seq<RNASeqInput>) =
60+
rpkmsOf
61+
(idLengthAndCounts |> Seq.map (fun x -> x.GeneID))
62+
(idLengthAndCounts |> Seq.map (fun x -> x.GeneLength))
63+
(idLengthAndCounts |> Seq.map (fun x -> x.GeneCount))
64+
65+
/// Performs TPM normalization
66+
let private tpmsOf (idLengthAndCounts:seq<RNASeqInput>) =
67+
let rpk =
68+
idLengthAndCounts
69+
|> Seq.map (fun idLengthAndCounts -> idLengthAndCounts.GeneCount/idLengthAndCounts.GeneLength/1000.)
70+
71+
let sumOfAllReads = rpk |> Seq.sum
72+
let sumOfAllReadsPerMil = sumOfAllReads / 1000000.
73+
let tpms = rpk |> Seq.map (fun rpks -> rpks/sumOfAllReadsPerMil)
74+
75+
let geneID =
76+
idLengthAndCounts
77+
|> Seq.map (fun idLengthAndCounts -> idLengthAndCounts.GeneID)
78+
79+
let tpmResult =
80+
Seq.map2 (fun ids counts -> NormalizedCounts.Create(ids, counts, TPM)) geneID tpms
81+
82+
tpmResult
83+
84+
/// Returns TPM normalized data
85+
let tpms (idLengthAndCounts:seq<RNASeqInput>) =
86+
tpmsOf idLengthAndCounts
87+

0 commit comments

Comments
 (0)