Skip to content

Commit

Permalink
Merge pull request #301 from poseidon-framework/orderedForge
Browse files Browse the repository at this point in the history
Ordered forge
  • Loading branch information
stschiff authored Jun 10, 2024
2 parents c6a765e + 9dcfec9 commit a141fc3
Show file tree
Hide file tree
Showing 17 changed files with 205 additions and 68 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
- V 1.5.2.0:
- A new option `forge --ordered` was added, which outputs the resulting package with individuals ordered according to the entered entities.
- V 1.5.1.0:
- A new option `list --individuals --fullJanno` adds all standard columns from the Janno to the per-individual output.
- A new API option `/individuals?additionalJannoColumns=ALL` triggers the same behaviour for the Web API.
Expand Down
2 changes: 1 addition & 1 deletion poseidon-hs.cabal
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: poseidon-hs
version: 1.5.1.0
version: 1.5.2.0
synopsis: A package with tools for working with Poseidon Genotype Data
description: The tools in this package read and analyse Poseidon-formatted genotype databases, a modular system for storing genotype data from thousands of individuals.
license: MIT
Expand Down
1 change: 1 addition & 0 deletions src-executables/Main-trident.hs
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ forgeOptParser = ForgeOptions <$> parseGenoDataSources
<*> parseMaybeOutPackageName
<*> parsePackageWise
<*> parseOutputPlinkPopMode
<*> parseOutputOrdered

genoconvertOptParser :: OP.Parser GenoconvertOptions
genoconvertOptParser = GenoconvertOptions <$> parseGenoDataSources
Expand Down
4 changes: 3 additions & 1 deletion src/Poseidon/CLI/Forge.hs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ data ForgeOptions = ForgeOptions
, _forgeOutPacName :: Maybe String
, _forgePackageWise :: Bool
, _forgeOutputPlinkPopMode :: PlinkPopNameMode
, _forgeOutputOrdered :: Bool
}

pacReadOpts :: PackageReadOptions
Expand All @@ -93,6 +94,7 @@ runForge (
entityInputs maybeSnpFile intersect_
outFormat minimal onlyGeno outPathRaw maybeOutName
packageWise outPlinkPopMode
outputOrdered
) = do

-- load packages --
Expand Down Expand Up @@ -137,7 +139,7 @@ runForge (
else entities

-- determine indices of relevant individuals
relevantIndices <- resolveUniqueEntityIndices relevantEntities indInfoCollection
relevantIndices <- resolveUniqueEntityIndices outputOrdered relevantEntities indInfoCollection

-- collect data --
-- janno
Expand Down
2 changes: 2 additions & 0 deletions src/Poseidon/CLI/OptparseApplicativeParsers.hs
Original file line number Diff line number Diff line change
Expand Up @@ -848,3 +848,5 @@ parseJannocoalIdStripRegex = OP.option (Just <$> OP.str) (
OP.value Nothing
)

-- | Parser for the boolean @--ordered@ command line switch of forge
-- (built with 'OP.switch').
parseOutputOrdered :: OP.Parser Bool
parseOutputOrdered = OP.switch (
    OP.long "ordered" <>
    OP.help "With this option, the output of forge is ordered according to the entities given."
    )
47 changes: 34 additions & 13 deletions src/Poseidon/EntityTypes.hs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ import Data.Aeson (FromJSON (..), ToJSON (..), Value (..),
withText)
import Data.Aeson.Types (Parser)
import Data.Char (isSpace)
import Data.List (groupBy, intercalate, nub, sortOn)
import Data.List (groupBy, intercalate, nub, sortOn,
(\\))
import Data.Maybe (isJust, isNothing, mapMaybe)
import Data.Text (Text, pack, unpack)
import Data.Version (Version, showVersion)
Expand Down Expand Up @@ -263,15 +264,35 @@ reportDuplicateIndividuals individuals = do -- loop over duplication groups
duplicateGroup@(firstInd : _) <- filter ((>1) . length) . groupBy (\a b -> indInfoName a == indInfoName b) . sortOn indInfoName $ individuals
return (firstInd, [SpecificInd n' (head g) p | IndividualInfo n' g p <- duplicateGroup])

-- | Resolve an entity list to indices into the individual list.
-- The first argument selects the strategy: 'True' dispatches to
-- 'resolveEntityIndicesOrdered', 'False' to 'resolveEntityIndicesUnordered'.
resolveEntityIndices :: (EntitySpec a) => Bool -> [a] -> IndividualInfoCollection -> [Int]
resolveEntityIndices isOrdered
    | isOrdered = resolveEntityIndicesOrdered
    | otherwise = resolveEntityIndicesUnordered

-- | this finds the indices of all individuals from an individual-list which are specified in the Entity list
resolveEntityIndices :: (MonadThrow m, EntitySpec a) => [a] -> IndividualInfoCollection -> m [Int]
resolveEntityIndices entities (indInfos, areLatest) = do
let relevantIndizes = [ i | (i, ind, l) <- zip3 [0..] indInfos areLatest, indInfoConformsToEntitySpecs ind l entities ]
return relevantIndizes

resolveUniqueEntityIndices :: (EntitySpec a) => [a] -> IndividualInfoCollection -> PoseidonIO [Int]
resolveUniqueEntityIndices entities indInfoCollection = do
relevantIndices <- resolveEntityIndices entities indInfoCollection
-- | Find the indices of all individuals that conform to the complete entity list.
-- The result follows the order of the individual list, not the order of the entities.
resolveEntityIndicesUnordered :: (EntitySpec a) => [a] -> IndividualInfoCollection -> [Int]
resolveEntityIndicesUnordered entities (indInfos, areLatest) =
    map fst . filter conforms $ zip [0..] (zip indInfos areLatest)
  where
    -- an individual is kept if it satisfies the entity list as a whole
    conforms (_, (ind, isLatest)) = indInfoConformsToEntitySpecs ind isLatest entities

-- | Find the indices of all individuals from an individual list which are specified
-- in the entity list, ordered by the entity list.
--
-- The entities are processed one at a time, from left to right. For every entity,
-- individuals selected so far for which the entity evaluates to @Just False@ are
-- dropped, and individuals for which it evaluates to @Just True@ and that are not
-- selected yet are appended at the end (in the order of the individual list).
resolveEntityIndicesOrdered :: (EntitySpec a) => [a] -> IndividualInfoCollection -> [Int]
resolveEntityIndicesOrdered entities (indInfos, areLatest) = map indexOf (go [] entities)
  where
    -- Every individual is paired once with its position and its is-latest flag.
    -- The selection carries these triples along, so no entity step has to fetch
    -- an individual again through the partial, linear-time (!!).
    indexed = zip3 [0 :: Int ..] indInfos areLatest
    indexOf (i, _, _) = i
    verdict entity (_, ind, l) = indInfoConformsToEntitySpec ind l entity
    go selected [] = selected
    go selected (entity:restEntities) =
        -- We first check whether any already selected individuals are removed due to
        -- the new entity (can happen if the entity is signed and negative)
        let selectedUpdated = filter ((/= Just False) . verdict entity) selected
        -- We then check which individuals are found according to the new entity...
            additionalAll = filter ((== Just True) . verdict entity) indexed
        -- ... and use only the ones that are not already selected. Positions are
        -- unique, so (\\) on the triples removes exactly the already selected ones.
            additionalNew = additionalAll \\ selectedUpdated
        in  go (selectedUpdated ++ additionalNew) restEntities

resolveUniqueEntityIndices :: (EntitySpec a) => Bool -> [a] -> IndividualInfoCollection -> PoseidonIO [Int]
resolveUniqueEntityIndices isOrdered entities indInfoCollection = do
let relevantIndices = resolveEntityIndices isOrdered entities indInfoCollection
let duplicateReport = reportDuplicateIndividuals . map ((fst indInfoCollection) !!) $ relevantIndices
-- check if there still are duplicates and if yes, then stop
unless (null duplicateReport) $ do
Expand All @@ -285,13 +306,13 @@ resolveUniqueEntityIndices entities indInfoCollection = do
return relevantIndices

-- | this returns a list of entities which could not be found
determineNonExistentEntities :: (MonadThrow m, EntitySpec a) => [a] -> IndividualInfoCollection -> m EntitiesList
determineNonExistentEntities entities indInfoCollection = do
return [ entity | entity <- map underlyingEntity entities, indices <- resolveEntityIndices [entity] indInfoCollection, null indices]
-- | Return the underlying entities of the given list for which not a single
-- individual can be resolved in the individual collection.
determineNonExistentEntities :: (EntitySpec a) => [a] -> IndividualInfoCollection -> EntitiesList
determineNonExistentEntities entities indInfoCollection =
    filter isMissing (map underlyingEntity entities)
  where
    -- each entity is resolved on its own; order is irrelevant for an emptiness check
    isMissing entity = null (resolveEntityIndices False [entity] indInfoCollection)

checkIfAllEntitiesExist :: (EntitySpec a) => [a] -> IndividualInfoCollection -> PoseidonIO ()
checkIfAllEntitiesExist entities indInfoCollection = do
nonExistentEntities <- determineNonExistentEntities entities indInfoCollection
let nonExistentEntities = determineNonExistentEntities entities indInfoCollection
unless (null nonExistentEntities) $ do
logError "The following entities could not be found in the dataset"
forM_ nonExistentEntities (logError . show)
Expand Down
112 changes: 66 additions & 46 deletions test/Poseidon/EntitiesListSpec.hs
Original file line number Diff line number Diff line change
Expand Up @@ -209,11 +209,11 @@ testFindNonExistentEntities =
describe "Poseidon.EntitiesList.determineNonExistentEntities" $ do
it "should ignore good entities" $ do
ps <- testLog $ readPoseidonPackageCollection testPacReadOpts testBaseDir
ents <- determineNonExistentEntities goodEntities =<< getJointIndividualInfo ps
ents <- determineNonExistentEntities goodEntities <$> getJointIndividualInfo ps
ents `shouldBe` []
it "should find bad entities" $ do
ps <- testLog $ readPoseidonPackageCollection testPacReadOpts testBaseDir
ents <- determineNonExistentEntities badEntities =<< getJointIndividualInfo ps
ents <- determineNonExistentEntities badEntities <$> getJointIndividualInfo ps
ents `shouldMatchList` badEntities

testFilterPackages :: Spec
Expand All @@ -233,11 +233,11 @@ testResolveEntityIndices =
describe "Poseidon.EntitiesList.resolveEntityIndices" $ do
it "should select all relevant individuals" $ do
ps <- testLog $ readPoseidonPackageCollection testPacReadOpts testBaseDir
indInts <- resolveEntityIndices goodEntities =<< getJointIndividualInfo ps
let indInts = resolveEntityIndices False goodEntities =<< getJointIndividualInfo ps
indInts `shouldBe` [10,11,12,16,18,20,21,22,23,24,25,26,27,28,29,35]
it "should drop all irrelevant individuals" $ do
ps <- testLog $ readPoseidonPackageCollection testPacReadOpts testBaseDir
indInts <- resolveEntityIndices badEntities =<< getJointIndividualInfo ps
let indInts = resolveEntityIndices False badEntities =<< getJointIndividualInfo ps
indInts `shouldBe` []
it "should correctly extract indices with ordered signed entities" $ do
let indInfo = [
Expand All @@ -252,16 +252,16 @@ testResolveEntityIndices =
]
areLatest <- mapM (isLatestInCollection indInfo) indInfo
let indInfoCollection = (indInfo, areLatest)
indInts1 <- resolveEntityIndices [
Include (Pac (PacNameAndVersion "Pac1" Nothing))
] indInfoCollection
let indInts1 = resolveEntityIndices False [
Include (Pac (PacNameAndVersion "Pac1" Nothing))
] indInfoCollection
indInts1 `shouldBe` [0, 1, 2, 3]
indInts2 <- resolveEntityIndices [
Include (Pac (PacNameAndVersion "Pac1" Nothing))
, Exclude (Group "Pop2")
, Include (Ind "Ind3")
, Include (SpecificInd "Ind8" "Pop4" (PacNameAndVersion "Pac2" Nothing))
] indInfoCollection
let indInts2 = resolveEntityIndices False [
Include (Pac (PacNameAndVersion "Pac1" Nothing))
, Exclude (Group "Pop2")
, Include (Ind "Ind3")
, Include (SpecificInd "Ind8" "Pop4" (PacNameAndVersion "Pac2" Nothing))
] indInfoCollection
indInts2 `shouldBe` [0, 1, 2, 7]
it "should correctly extract indices in case of duplicates across packages" $ do
let indInfoDuplicates = [
Expand All @@ -274,19 +274,19 @@ testResolveEntityIndices =
areLatest <- mapM (isLatestInCollection indInfoDuplicates) indInfoDuplicates
let indInfoDupCollection = (indInfoDuplicates, areLatest)
-- test simple extraction with specific syntax
indInts1 <- resolveEntityIndices [
Include (SpecificInd "Ind1" "Pop1" (PacNameAndVersion "Pac2" Nothing))
] indInfoDupCollection
let indInts1 = resolveEntityIndices False [
Include (SpecificInd "Ind1" "Pop1" (PacNameAndVersion "Pac2" Nothing))
] indInfoDupCollection
indInts1 `shouldBe` [1]
-- test solving simple duplication for one individual
indInts2 <- resolveEntityIndices [
Include $ Ind "Ind1"
, Exclude $ Pac (PacNameAndVersion "Pac2" Nothing)
, Exclude $ Pac (PacNameAndVersion "Pac3" Nothing)
] indInfoDupCollection
let indInts2 = resolveEntityIndices False [
Include $ Ind "Ind1"
, Exclude $ Pac (PacNameAndVersion "Pac2" Nothing)
, Exclude $ Pac (PacNameAndVersion "Pac3" Nothing)
] indInfoDupCollection
indInts2 `shouldBe` [0]
-- test output in case of unresolved duplicates
indInts3 <- resolveEntityIndices [Include (Ind "Ind2")] indInfoDupCollection
let indInts3 = resolveEntityIndices False [Include (Ind "Ind2")] indInfoDupCollection
indInts3 `shouldBe` [3, 4, 5]
let duplicateReport = reportDuplicateIndividuals . map (indInfoDuplicates !!) $ indInts3
duplicateReport `shouldBe` [
Expand All @@ -296,12 +296,12 @@ testResolveEntityIndices =
SpecificInd "Ind2" "Pop2" (PacNameAndVersion "Pac3" Nothing)])
]
-- test interaction with secondary group name selection and negative selection to solve duplication
indInts4 <- resolveEntityIndices [
Include $ Group "PopB"
, Exclude $ Group "Pop2"
, Exclude $ Pac (PacNameAndVersion "Pac2" Nothing)
, Exclude $ Pac (PacNameAndVersion "Pac3" Nothing)
] indInfoDupCollection
let indInts4 = resolveEntityIndices False [
Include $ Group "PopB"
, Exclude $ Group "Pop2"
, Exclude $ Pac (PacNameAndVersion "Pac2" Nothing)
, Exclude $ Pac (PacNameAndVersion "Pac3" Nothing)
] indInfoDupCollection
indInts4 `shouldBe` [0]
it "should correctly extract indices in case of multiple package versions" $ do
let indInfo = [
Expand All @@ -314,29 +314,49 @@ testResolveEntityIndices =
]
areLatest <- mapM (isLatestInCollection indInfo) indInfo
let indInfoCollection = (indInfo, areLatest)
indInts1 <- resolveEntityIndices [Include $ Pac (PacNameAndVersion "Pac1" Nothing)] indInfoCollection
let indInts1 = resolveEntityIndices False [Include $ Pac (PacNameAndVersion "Pac1" Nothing)] indInfoCollection
indInts1 `shouldBe` [2, 3]
indInts2 <- resolveEntityIndices [
Include (Pac (PacNameAndVersion "Pac1" (Just $ makeVersion [1,0,0])))
] indInfoCollection
let indInts2 = resolveEntityIndices False [
Include (Pac (PacNameAndVersion "Pac1" (Just $ makeVersion [1,0,0])))
] indInfoCollection
indInts2 `shouldBe` [0,1]
indInts3 <- resolveEntityIndices [
Include (Pac (PacNameAndVersion "Pac1" (Just $ makeVersion [2,0,0])))
] indInfoCollection
let indInts3 = resolveEntityIndices False [
Include (Pac (PacNameAndVersion "Pac1" (Just $ makeVersion [2,0,0])))
] indInfoCollection
indInts3 `shouldBe` [2,3]
indInts4 <- resolveEntityIndices [
Include (SpecificInd "Ind1" "Pop1" (PacNameAndVersion "Pac1" (Just $ makeVersion [2,0,0])))
, Include (SpecificInd "Ind1" "Pop1" (PacNameAndVersion "Pac1" (Just $ makeVersion [1,0,0])))
, Exclude (Pac (PacNameAndVersion "Pac1" (Just $ makeVersion [1,0,0])))
] indInfoCollection
let indInts4 = resolveEntityIndices False [
Include (SpecificInd "Ind1" "Pop1" (PacNameAndVersion "Pac1" (Just $ makeVersion [2,0,0])))
, Include (SpecificInd "Ind1" "Pop1" (PacNameAndVersion "Pac1" (Just $ makeVersion [1,0,0])))
, Exclude (Pac (PacNameAndVersion "Pac1" (Just $ makeVersion [1,0,0])))
] indInfoCollection
indInts4 `shouldBe` [2]
indInts5 <- resolveEntityIndices [
Include (SpecificInd "Ind1" "Pop1" (PacNameAndVersion "Pac1" (Just $ makeVersion [2,0,0])))
, Include (SpecificInd "Ind1" "Pop1" (PacNameAndVersion "Pac1" (Just $ makeVersion [1,0,0])))
, Exclude (Pac (PacNameAndVersion "Pac1" Nothing))
, Include (Ind "Ind4")
] indInfoCollection
let indInts5 = resolveEntityIndices False [
Include (SpecificInd "Ind1" "Pop1" (PacNameAndVersion "Pac1" (Just $ makeVersion [2,0,0])))
, Include (SpecificInd "Ind1" "Pop1" (PacNameAndVersion "Pac1" (Just $ makeVersion [1,0,0])))
, Exclude (Pac (PacNameAndVersion "Pac1" Nothing))
, Include (Ind "Ind4")
] indInfoCollection
indInts5 `shouldBe` [5]
it "should correctly respect the order in case of ordered resolve" $ do
let indInfo = [
IndividualInfo "Ind1" ["Pop1", "PopB"] (PacNameAndVersion "Pac1" (Just $ makeVersion [1,0,0]))
, IndividualInfo "Ind2" ["Pop1", "PopB"] (PacNameAndVersion "Pac1" (Just $ makeVersion [1,0,0]))
, IndividualInfo "Ind1" ["Pop1", "PopB"] (PacNameAndVersion "Pac1" (Just $ makeVersion [2,0,0]))
, IndividualInfo "Ind2" ["Pop1", "PopB"] (PacNameAndVersion "Pac1" (Just $ makeVersion [2,0,0]))
, IndividualInfo "Ind3" ["Pop3", "PopC"] (PacNameAndVersion "Pac2" Nothing)
, IndividualInfo "Ind4" ["Pop3", "PopC"] (PacNameAndVersion "Pac2" Nothing)
]
areLatest <- mapM (isLatestInCollection indInfo) indInfo
let indInfoCollection = (indInfo, areLatest)
let entities = [
Include (Ind "Ind1")
, Include (Group "Pop3")
, Exclude (Ind "Ind3")
, Include (SpecificInd "Ind2" "Pop1" (PacNameAndVersion "Pac1" (Just $ makeVersion [1, 0, 0])))
]
resolveEntityIndices False entities indInfoCollection `shouldBe` [1, 2, 5]
resolveEntityIndices True entities indInfoCollection `shouldBe` [2, 5, 1]


testShow :: Spec
testShow =
Expand Down
3 changes: 3 additions & 0 deletions test/PoseidonGoldenTests/GoldenTestCheckSumFile.txt
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,9 @@ d2d112d9de2e45a113ac206f64a0667e forge forge/ForgePac13/ForgePac13.janno
c3ea6ad176514659f44ffb71291117d0 forge forge/ForgePac15/ForgePac15.janno
c3ea6ad176514659f44ffb71291117d0 forge forge/ForgePac16/ForgePac16.janno
4e3c2fcf7c1f3b2de916666c3abdd53c forge forge/ForgePac17/ForgePac17.janno
8ad890117500b83c206729a8126d8a51 forge forge/ForgePac18/ForgePac18.janno
3fc77f5a5b83ac4fc7b082c9b3ee4ba1 forge forge/ForgePac18/ForgePac18.fam
842885ffe256819b264991384020c2ac forge forge/ForgePac18/ForgePac18.bed
d4a05cfef045648238a94a9d621cf667 chronicle chronicle/chronicle1.yml
b43da4d5734371c0648553120f812466 timetravel timetravel/Lamnidis_2018-1.0.0/POSEIDON.yml
8d57ce1a1ab28c0d8a5f391dd790a59c timetravel timetravel/Lamnidis_2018-1.0.1/POSEIDON.yml
Expand Down
14 changes: 7 additions & 7 deletions test/PoseidonGoldenTests/GoldenTestData/chronicle/chronicle2.yml
Original file line number Diff line number Diff line change
@@ -1,29 +1,29 @@
title: Chronicle title
description: Chronicle description
chronicleVersion: 0.2.0
lastModified: 2024-04-02
lastModified: 2024-06-03
packages:
- title: Lamnidis_2018
version: 1.0.0
commit: dbec249fe4197c78502ed0cecd2d7fff855e6463
commit: bd7522f6a38ae1683be69e66ac33ffc79ca7737f
path: Lamnidis_2018
- title: Lamnidis_2018
version: 1.0.1
commit: dbec249fe4197c78502ed0cecd2d7fff855e6463
commit: bd7522f6a38ae1683be69e66ac33ffc79ca7737f
path: Lamnidis_2018_newVersion
- title: Schiffels
version: 1.1.1
commit: 585c055f40836db0fbde0267cd6e8c472f8f6ff3
commit: b03d74fd3bc286b10fd9edfdedeec1d35fc21e24
path: Schiffels
- title: Schiffels_2016
version: 1.0.1
commit: dbec249fe4197c78502ed0cecd2d7fff855e6463
commit: bd7522f6a38ae1683be69e66ac33ffc79ca7737f
path: Schiffels_2016
- title: Schmid_2028
version: 1.0.0
commit: dbec249fe4197c78502ed0cecd2d7fff855e6463
commit: bd7522f6a38ae1683be69e66ac33ffc79ca7737f
path: Schmid_2028
- title: Wang_2020
version: 0.1.0
commit: dbec249fe4197c78502ed0cecd2d7fff855e6463
commit: bd7522f6a38ae1683be69e66ac33ffc79ca7737f
path: Wang_2020
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
@article{Schiffels2016,
title = {Test},
}

@book{TestBook1,
title = {TestBook},
}

@article{TestPaper1,
title = {TestPaper},
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
1 1_752566 2.013e-2 752566 G A
1 1_842013 2.2518e-2 842013 T G
1 1_891021 2.4116e-2 891021 G A
1 1_949654 2.5727e-2 949654 A G
2 2_1018704 2.6288e-2 1018704 A G
2 2_1045331 2.6665e-2 1045331 G A
2 2_1048955 2.6674e-2 1048955 A G
2 2_1061166 2.6711e-2 1061166 T C
2 2_1108637 2.8311e-2 1108637 G A
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
POP3 XXX008 0 0 2 0
POP3 XXX010 0 0 1 0
POP2 XXX004 0 0 2 0
POP2 XXX006 0 0 2 0
POP1 XXX003 0 0 1 0
Loading

0 comments on commit a141fc3

Please sign in to comment.