diff --git a/.all-contributorsrc b/.all-contributorsrc new file mode 100644 index 0000000000000..838d5de99846c --- /dev/null +++ b/.all-contributorsrc @@ -0,0 +1,1718 @@ +{ + "files": [ + "README.md" + ], + "imageSize": 50, + "badgeTemplate": "", + "commit": false, + "contributors": [ + { + "login": "timgl", + "name": "timgl", + "avatar_url": "https://avatars.githubusercontent.com/u/1727427?v=4", + "profile": "https://github.com/timgl", + "contributions": [ + "code" + ] + }, + { + "login": "mariusandra", + "name": "mariusandra", + "avatar_url": "https://avatars.githubusercontent.com/u/53387?v=4", + "profile": "https://github.com/mariusandra", + "contributions": [ + "code" + ] + }, + { + "login": "EDsCODE", + "name": "EDsCODE", + "avatar_url": "https://avatars.githubusercontent.com/u/13127476?v=4", + "profile": "https://github.com/EDsCODE", + "contributions": [ + "code" + ] + }, + { + "login": "Twixes", + "name": "Twixes", + "avatar_url": "https://avatars.githubusercontent.com/u/4550621?v=4", + "profile": "https://github.com/Twixes", + "contributions": [ + "code", + "test", + "design" + ] + }, + { + "login": "macobo", + "name": "macobo", + "avatar_url": "https://avatars.githubusercontent.com/u/148820?v=4", + "profile": "https://github.com/macobo", + "contributions": [ + "code" + ] + }, + { + "login": "paolodamico", + "name": "paolodamico", + "avatar_url": "https://avatars.githubusercontent.com/u/5864173?v=4", + "profile": "https://github.com/paolodamico", + "contributions": [ + "code", + "test" + ] + }, + { + "login": "fuziontech", + "name": "fuziontech", + "avatar_url": "https://avatars.githubusercontent.com/u/391319?v=4", + "profile": "https://github.com/fuziontech", + "contributions": [ + "code" + ] + }, + { + "login": "yakkomajuri", + "name": "yakkomajuri", + "avatar_url": "https://avatars.githubusercontent.com/u/38760734?v=4", + "profile": "https://github.com/yakkomajuri", + "contributions": [ + "code", + "bug" + ] + }, + { + "login": "jamesefhawkins", + "name": "jamesefhawkins", + "avatar_url": "https://avatars.githubusercontent.com/u/47497682?v=4", + "profile": "https://github.com/jamesefhawkins", + "contributions": [ + "code" + ] + }, + { + "login": "posthog-bot", + "name": "posthog-bot", + "avatar_url": "https://avatars.githubusercontent.com/u/69588470?v=4", + "profile": "https://github.com/posthog-bot", + "contributions": [ + "code" + ] + }, + { + "login": "dependabot-preview[bot]", + "name": "dependabot-preview[bot]", + "avatar_url": "https://avatars.githubusercontent.com/in/2141?v=4", + "profile": "https://github.com/apps/dependabot-preview", + "contributions": [ + "code" + ] + }, + { + "login": "bhavish-agarwal", + "name": "bhavish-agarwal", + "avatar_url": "https://avatars.githubusercontent.com/u/14195048?v=4", + "profile": "https://github.com/bhavish-agarwal", + "contributions": [ + "code", + "test", + "bug" + ] + }, + { + "login": "Tannergoods", + "name": "Tannergoods", + "avatar_url": "https://avatars.githubusercontent.com/u/60791437?v=4", + "profile": "https://github.com/Tannergoods", + "contributions": [ + "code" + ] + }, + { + "login": "ungless", + "name": "ungless", + "avatar_url": "https://avatars.githubusercontent.com/u/8397061?v=4", + "profile": "https://github.com/ungless", + "contributions": [ + "code" + ] + }, + { + "login": "dependabot[bot]", + "name": "dependabot[bot]", + "avatar_url": "https://avatars.githubusercontent.com/in/29110?v=4", + "profile": "https://github.com/apps/dependabot", + "contributions": [ + "code" + ] + }, + { + "login": "gzog", + "name": 
"gzog", + "avatar_url": "https://avatars.githubusercontent.com/u/1487006?v=4", + "profile": "https://github.com/gzog", + "contributions": [ + "code", + "test", + "bug" + ] + }, + { + "login": "samcaspus", + "name": "samcaspus", + "avatar_url": "https://avatars.githubusercontent.com/u/19220113?v=4", + "profile": "https://github.com/samcaspus", + "contributions": [ + "code", + "ideas", + "test" + ] + }, + { + "login": "Tmunayyer", + "name": "Tmunayyer", + "avatar_url": "https://avatars.githubusercontent.com/u/29887304?v=4", + "profile": "https://github.com/Tmunayyer", + "contributions": [ + "code", + "bug" + ] + }, + { + "login": "adamb70", + "name": "adamb70", + "avatar_url": "https://avatars.githubusercontent.com/u/11885987?v=4", + "profile": "https://github.com/adamb70", + "contributions": [ + "code", + "content", + "example", + "doc" + ] + }, + { + "login": "SanketDG", + "name": "SanketDG", + "avatar_url": "https://avatars.githubusercontent.com/u/8980971?v=4", + "profile": "https://github.com/SanketDG", + "contributions": [ + "code", + "design", + "test" + ] + }, + { + "login": "kpthatsme", + "name": "kpthatsme", + "avatar_url": "https://avatars.githubusercontent.com/u/5965891?v=4", + "profile": "https://github.com/kpthatsme", + "contributions": [ + "code" + ] + }, + { + "login": "J0", + "name": "J0", + "avatar_url": "https://avatars.githubusercontent.com/u/8011761?v=4", + "profile": "https://github.com/J0", + "contributions": [ + "code" + ] + }, + { + "login": "14MR", + "name": "14MR", + "avatar_url": "https://avatars.githubusercontent.com/u/5824170?v=4", + "profile": "https://github.com/14MR", + "contributions": [ + "code", + "doc", + "ideas", + "bug" + ] + }, + { + "login": "03difoha", + "name": "03difoha", + "avatar_url": "https://avatars.githubusercontent.com/u/8876615?v=4", + "profile": "https://github.com/03difoha", + "contributions": [ + "code" + ] + }, + { + "login": "ahtik", + "name": "ahtik", + "avatar_url": "https://avatars.githubusercontent.com/u/140952?v=4", + "profile": "https://github.com/ahtik", + "contributions": [ + "code" + ] + }, + { + "login": "Algogator", + "name": "Algogator", + "avatar_url": "https://avatars.githubusercontent.com/u/1433469?v=4", + "profile": "https://github.com/Algogator", + "contributions": [ + "code" + ] + }, + { + "login": "GalDayan", + "name": "GalDayan", + "avatar_url": "https://avatars.githubusercontent.com/u/24251369?v=4", + "profile": "https://github.com/GalDayan", + "contributions": [ + "code" + ] + }, + { + "login": "Kacppian", + "name": "Kacppian", + "avatar_url": "https://avatars.githubusercontent.com/u/14990078?v=4", + "profile": "https://github.com/Kacppian", + "contributions": [ + "code" + ] + }, + { + "login": "FUSAKLA", + "name": "FUSAKLA", + "avatar_url": "https://avatars.githubusercontent.com/u/6112562?v=4", + "profile": "https://github.com/FUSAKLA", + "contributions": [ + "code", + "ideas", + "bug", + "test", + "doc" + ] + }, + { + "login": "iMerica", + "name": "iMerica", + "avatar_url": "https://avatars.githubusercontent.com/u/487897?v=4", + "profile": "https://github.com/iMerica", + "contributions": [ + "code" + ] + }, + { + "login": "stevenphaedonos", + "name": "stevenphaedonos", + "avatar_url": "https://avatars.githubusercontent.com/u/12955616?v=4", + "profile": "https://github.com/stevenphaedonos", + "contributions": [ + "code", + "test", + "ideas", + "example", + "bug" + ] + }, + { + "login": "tapico-weyert", + "name": "tapico-weyert", + "avatar_url": "https://avatars.githubusercontent.com/u/70971917?v=4", + "profile": 
"https://github.com/tapico-weyert", + "contributions": [ + "code", + "plugin", + "bug", + "test", + "ideas" + ] + }, + { + "login": "adamschoenemann", + "name": "adamschoenemann", + "avatar_url": "https://avatars.githubusercontent.com/u/2095226?v=4", + "profile": "https://github.com/adamschoenemann", + "contributions": [ + "code" + ] + }, + { + "login": "AlexandreBonaventure", + "name": "AlexandreBonaventure", + "avatar_url": "https://avatars.githubusercontent.com/u/4596409?v=4", + "profile": "https://github.com/AlexandreBonaventure", + "contributions": [ + "code" + ] + }, + { + "login": "dan-dr", + "name": "dan-dr", + "avatar_url": "https://avatars.githubusercontent.com/u/6669808?v=4", + "profile": "https://github.com/dan-dr", + "contributions": [ + "code" + ] + }, + { + "login": "dts", + "name": "dts", + "avatar_url": "https://avatars.githubusercontent.com/u/273856?v=4", + "profile": "https://github.com/dts", + "contributions": [ + "code" + ] + }, + { + "login": "jamiehaywood", + "name": "jamiehaywood", + "avatar_url": "https://avatars.githubusercontent.com/u/26779712?v=4", + "profile": "https://github.com/jamiehaywood", + "contributions": [ + "code" + ] + }, + { + "login": "rushabhnagda11", + "name": "rushabhnagda11", + "avatar_url": "https://avatars.githubusercontent.com/u/3235568?v=4", + "profile": "https://github.com/rushabhnagda11", + "contributions": [ + "code" + ] + }, + { + "login": "weyert", + "name": "weyert", + "avatar_url": "https://avatars.githubusercontent.com/u/7049?v=4", + "profile": "https://github.com/weyert", + "contributions": [ + "code", + "plugin", + "bug", + "test", + "ideas" + ] + }, + { + "login": "casio", + "name": "casio", + "avatar_url": "https://avatars.githubusercontent.com/u/29784?v=4", + "profile": "https://github.com/casio", + "contributions": [ + "code" + ] + }, + { + "login": "Hungsiro506", + "name": "Hungsiro506", + "avatar_url": "https://avatars.githubusercontent.com/u/10346923?v=4", + "profile": "https://github.com/Hungsiro506", + "contributions": [ + "code" + ] + }, + { + "login": "bitbreakr", + "name": "bitbreakr", + "avatar_url": "https://avatars.githubusercontent.com/u/3123986?v=4", + "profile": "https://github.com/bitbreakr", + "contributions": [ + "code" + ] + }, + { + "login": "edmorley", + "name": "edmorley", + "avatar_url": "https://avatars.githubusercontent.com/u/501702?v=4", + "profile": "https://github.com/edmorley", + "contributions": [ + "code" + ] + }, + { + "login": "wundo", + "name": "wundo", + "avatar_url": "https://avatars.githubusercontent.com/u/113942?v=4", + "profile": "https://github.com/wundo", + "contributions": [ + "code" + ] + }, + { + "login": "andreipopovici", + "name": "andreipopovici", + "avatar_url": "https://avatars.githubusercontent.com/u/1143417?v=4", + "profile": "https://github.com/andreipopovici", + "contributions": [ + "code" + ] + }, + { + "login": "benjackwhite", + "name": "benjackwhite", + "avatar_url": "https://avatars.githubusercontent.com/u/2536520?v=4", + "profile": "https://github.com/benjackwhite", + "contributions": [ + "code" + ] + }, + { + "login": "serhey-dev", + "name": "serhey-dev", + "avatar_url": "https://avatars.githubusercontent.com/u/37838803?v=4", + "profile": "https://github.com/serhey-dev", + "contributions": [ + "code" + ] + }, + { + "login": "sjmadsen", + "name": "sjmadsen", + "avatar_url": "https://avatars.githubusercontent.com/u/57522?v=4", + "profile": "https://github.com/sjmadsen", + "contributions": [ + "code" + ] + }, + { + "login": "piemets", + "name": "piemets", + "avatar_url": 
"https://avatars.githubusercontent.com/u/70321811?v=4", + "profile": "https://github.com/piemets", + "contributions": [ + "code" + ] + }, + { + "login": "eltjehelene", + "name": "eltjehelene", + "avatar_url": "https://avatars.githubusercontent.com/u/75622766?v=4", + "profile": "https://github.com/eltjehelene", + "contributions": [ + "code" + ] + }, + { + "login": "athreyaanand", + "name": "athreyaanand", + "avatar_url": "https://avatars.githubusercontent.com/u/31478366?v=4", + "profile": "https://github.com/athreyaanand", + "contributions": [ + "code", + "doc", + "content" + ] + }, + { + "login": "berntgl", + "name": "berntgl", + "avatar_url": "https://avatars.githubusercontent.com/u/55957336?v=4", + "profile": "https://github.com/berntgl", + "contributions": [ + "code" + ] + }, + { + "login": "fakela", + "name": "fakela", + "avatar_url": "https://avatars.githubusercontent.com/u/39309699?v=4", + "profile": "https://github.com/fakela", + "contributions": [ + "code", + "doc" + ] + }, + { + "login": "seanpackham", + "name": "seanpackham", + "avatar_url": "https://avatars.githubusercontent.com/u/3830791?v=4", + "profile": "https://github.com/seanpackham", + "contributions": [ + "code" + ] + }, + { + "login": "corywatilo", + "name": "corywatilo", + "avatar_url": "https://avatars.githubusercontent.com/u/154479?v=4", + "profile": "https://github.com/corywatilo", + "contributions": [ + "code" + ] + }, + { + "login": "mikenicklas", + "name": "mikenicklas", + "avatar_url": "https://avatars.githubusercontent.com/u/6363580?v=4", + "profile": "https://github.com/mikenicklas", + "contributions": [ + "code" + ] + }, + { + "login": "lottiecoxon", + "name": "lottiecoxon", + "avatar_url": "https://avatars.githubusercontent.com/u/65415371?v=4", + "profile": "https://github.com/lottiecoxon", + "contributions": [ + "code" + ] + }, + { + "login": "oshura3", + "name": "oshura3", + "avatar_url": "https://avatars.githubusercontent.com/u/30472479?v=4", + "profile": "https://github.com/oshura3", + "contributions": [ + "code", + "doc" + ] + }, + { + "login": "Abo7atm", + "name": "Abo7atm", + "avatar_url": "https://avatars.githubusercontent.com/u/33042538?v=4", + "profile": "https://github.com/Abo7atm", + "contributions": [ + "code" + ] + }, + { + "login": "brianetaveras", + "name": "brianetaveras", + "avatar_url": "https://avatars.githubusercontent.com/u/52111440?v=4", + "profile": "https://github.com/brianetaveras", + "contributions": [ + "code" + ] + }, + { + "login": "callumgare", + "name": "callumgare", + "avatar_url": "https://avatars.githubusercontent.com/u/346340?v=4", + "profile": "https://github.com/callumgare", + "contributions": [ + "code" + ] + }, + { + "login": "RedFrez", + "name": "RedFrez", + "avatar_url": "https://avatars.githubusercontent.com/u/30352852?v=4", + "profile": "https://github.com/RedFrez", + "contributions": [ + "code" + ] + }, + { + "login": "cirdes", + "name": "cirdes", + "avatar_url": "https://avatars.githubusercontent.com/u/727781?v=4", + "profile": "https://github.com/cirdes", + "contributions": [ + "code" + ] + }, + { + "login": "DannyBen", + "name": "DannyBen", + "avatar_url": "https://avatars.githubusercontent.com/u/2405099?v=4", + "profile": "https://github.com/DannyBen", + "contributions": [ + "code" + ] + }, + { + "login": "sj26", + "name": "sj26", + "avatar_url": "https://avatars.githubusercontent.com/u/14028?v=4", + "profile": "https://github.com/sj26", + "contributions": [ + "code" + ] + }, + { + "login": "paulanunda", + "name": "paulanunda", + "avatar_url": 
"https://avatars.githubusercontent.com/u/155981?v=4", + "profile": "https://github.com/paulanunda", + "contributions": [ + "code" + ] + }, + { + "login": "arosales", + "name": "arosales", + "avatar_url": "https://avatars.githubusercontent.com/u/1707853?v=4", + "profile": "https://github.com/arosales", + "contributions": [ + "code" + ] + }, + { + "login": "ChandanSagar", + "name": "ChandanSagar", + "avatar_url": "https://avatars.githubusercontent.com/u/27363164?v=4", + "profile": "https://github.com/ChandanSagar", + "contributions": [ + "code" + ] + }, + { + "login": "wadenick", + "name": "wadenick", + "avatar_url": "https://avatars.githubusercontent.com/u/9014043?v=4", + "profile": "https://github.com/wadenick", + "contributions": [ + "code" + ] + }, + { + "login": "jgannondo", + "name": "jgannondo", + "avatar_url": "https://avatars.githubusercontent.com/u/28159071?v=4", + "profile": "https://github.com/jgannondo", + "contributions": [ + "code" + ] + }, + { + "login": "keladhruv", + "name": "keladhruv", + "avatar_url": "https://avatars.githubusercontent.com/u/30433468?v=4", + "profile": "https://github.com/keladhruv", + "contributions": [ + "code" + ] + }, + { + "login": "grellyd", + "name": "grellyd", + "avatar_url": "https://avatars.githubusercontent.com/u/7812612?v=4", + "profile": "https://github.com/grellyd", + "contributions": [ + "code" + ] + }, + { + "login": "rberrelleza", + "name": "rberrelleza", + "avatar_url": "https://avatars.githubusercontent.com/u/475313?v=4", + "profile": "https://github.com/rberrelleza", + "contributions": [ + "code" + ] + }, + { + "login": "annanay25", + "name": "annanay25", + "avatar_url": "https://avatars.githubusercontent.com/u/10982987?v=4", + "profile": "https://github.com/annanay25", + "contributions": [ + "code" + ] + }, + { + "login": "cohix", + "name": "cohix", + "avatar_url": "https://avatars.githubusercontent.com/u/5942370?v=4", + "profile": "https://github.com/cohix", + "contributions": [ + "code" + ] + }, + { + "login": "gouthamve", + "name": "gouthamve", + "avatar_url": "https://avatars.githubusercontent.com/u/7354143?v=4", + "profile": "https://github.com/gouthamve", + "contributions": [ + "code" + ] + }, + { + "login": "alexellis", + "name": "alexellis", + "avatar_url": "https://avatars.githubusercontent.com/u/6358735?v=4", + "profile": "https://github.com/alexellis", + "contributions": [ + "code" + ] + }, + { + "login": "prologic", + "name": "prologic", + "avatar_url": "https://avatars.githubusercontent.com/u/1290234?v=4", + "profile": "https://github.com/prologic", + "contributions": [ + "code" + ] + }, + { + "login": "jgustie", + "name": "jgustie", + "avatar_url": "https://avatars.githubusercontent.com/u/883981?v=4", + "profile": "https://github.com/jgustie", + "contributions": [ + "code" + ] + }, + { + "login": "kubemq", + "name": "kubemq", + "avatar_url": "https://avatars.githubusercontent.com/u/45835100?v=4", + "profile": "https://github.com/kubemq", + "contributions": [ + "code" + ] + }, + { + "login": "vania-pooh", + "name": "vania-pooh", + "avatar_url": "https://avatars.githubusercontent.com/u/829320?v=4", + "profile": "https://github.com/vania-pooh", + "contributions": [ + "code" + ] + }, + { + "login": "irespaldiza", + "name": "irespaldiza", + "avatar_url": "https://avatars.githubusercontent.com/u/11633327?v=4", + "profile": "https://github.com/irespaldiza", + "contributions": [ + "code" + ] + }, + { + "login": "croomes", + "name": "croomes", + "avatar_url": "https://avatars.githubusercontent.com/u/211994?v=4", + "profile": 
"https://github.com/croomes", + "contributions": [ + "code" + ] + }, + { + "login": "snormore", + "name": "snormore", + "avatar_url": "https://avatars.githubusercontent.com/u/182290?v=4", + "profile": "https://github.com/snormore", + "contributions": [ + "code" + ] + }, + { + "login": "faik", + "name": "faik", + "avatar_url": "https://avatars.githubusercontent.com/u/43129?v=4", + "profile": "https://github.com/faik", + "contributions": [ + "code" + ] + }, + { + "login": "aandryashin", + "name": "aandryashin", + "avatar_url": "https://avatars.githubusercontent.com/u/1412461?v=4", + "profile": "https://github.com/aandryashin", + "contributions": [ + "code" + ] + }, + { + "login": "andrewsomething", + "name": "andrewsomething", + "avatar_url": "https://avatars.githubusercontent.com/u/46943?v=4", + "profile": "https://github.com/andrewsomething", + "contributions": [ + "code" + ] + }, + { + "login": "Ferroin", + "name": "Ferroin", + "avatar_url": "https://avatars.githubusercontent.com/u/905151?v=4", + "profile": "https://github.com/Ferroin", + "contributions": [ + "code" + ] + }, + { + "login": "cpanato", + "name": "cpanato", + "avatar_url": "https://avatars.githubusercontent.com/u/4115580?v=4", + "profile": "https://github.com/cpanato", + "contributions": [ + "code" + ] + }, + { + "login": "cakrit", + "name": "cakrit", + "avatar_url": "https://avatars.githubusercontent.com/u/43294513?v=4", + "profile": "https://github.com/cakrit", + "contributions": [ + "code" + ] + }, + { + "login": "dkhenry", + "name": "dkhenry", + "avatar_url": "https://avatars.githubusercontent.com/u/489643?v=4", + "profile": "https://github.com/dkhenry", + "contributions": [ + "code" + ] + }, + { + "login": "oxplot", + "name": "oxplot", + "avatar_url": "https://avatars.githubusercontent.com/u/483682?v=4", + "profile": "https://github.com/oxplot", + "contributions": [ + "code" + ] + }, + { + "login": "marc-barry", + "name": "marc-barry", + "avatar_url": "https://avatars.githubusercontent.com/u/4965634?v=4", + "profile": "https://github.com/marc-barry", + "contributions": [ + "code" + ] + }, + { + "login": "moabu", + "name": "moabu", + "avatar_url": "https://avatars.githubusercontent.com/u/47318409?v=4", + "profile": "https://github.com/moabu", + "contributions": [ + "code" + ] + }, + { + "login": "nawazdhandala", + "name": "nawazdhandala", + "avatar_url": "https://avatars.githubusercontent.com/u/2697338?v=4", + "profile": "https://github.com/nawazdhandala", + "contributions": [ + "code" + ] + }, + { + "login": "dar-mehta", + "name": "dar-mehta", + "avatar_url": "https://avatars.githubusercontent.com/u/10489943?v=4", + "profile": "https://github.com/dar-mehta", + "contributions": [ + "code" + ] + }, + { + "login": "gmmorris", + "name": "gmmorris", + "avatar_url": "https://avatars.githubusercontent.com/u/386208?v=4", + "profile": "https://github.com/gmmorris", + "contributions": [ + "code" + ] + }, + { + "login": "bitdeli-chef", + "name": "bitdeli-chef", + "avatar_url": "https://avatars.githubusercontent.com/u/3092978?v=4", + "profile": "https://github.com/bitdeli-chef", + "contributions": [ + "code" + ] + }, + { + "login": "nsidartha", + "name": "nsidartha", + "avatar_url": "https://avatars.githubusercontent.com/u/26918226?v=4", + "profile": "https://github.com/nsidartha", + "contributions": [ + "code" + ] + }, + { + "login": "bard", + "name": "Massimiliano Mirra", + "avatar_url": "https://avatars.githubusercontent.com/u/19322?v=4", + "profile": "http://massimilianomirra.com/", + "contributions": [ + "code", + "doc" + ] + 
}, + { + "login": "bronsonavila", + "name": "Bronson Avila", + "avatar_url": "https://avatars.githubusercontent.com/u/30540995?v=4", + "profile": "https://www.bronsonavila.com/", + "contributions": [ + "code", + "doc" + ] + }, + { + "login": "posthog-contributions-bot[bot]", + "name": "posthog-contributions-bot[bot]", + "avatar_url": "https://avatars.githubusercontent.com/in/105985?v=4", + "profile": "https://github.com/apps/posthog-contributions-bot", + "contributions": [ + "code" + ] + }, + { + "login": "joesaunderson", + "name": "Joe Saunderson", + "avatar_url": "https://avatars.githubusercontent.com/u/11272509?v=4", + "profile": "https://github.com/joesaunderson", + "contributions": [ + "ideas", + "code" + ] + }, + { + "login": "MrSaints", + "name": "Ian L.", + "avatar_url": "https://avatars.githubusercontent.com/u/68859?v=4", + "profile": "https://www.ianlai.dev/", + "contributions": [ + "ideas", + "userTesting" + ] + }, + { + "login": "shogunpurple", + "name": "Martin McKeaveney", + "avatar_url": "https://avatars.githubusercontent.com/u/11256663?v=4", + "profile": "http://martinmck.com", + "contributions": [ + "ideas" + ] + }, + { + "login": "lharress", + "name": "Lleo Harress", + "avatar_url": "https://avatars.githubusercontent.com/u/13482930?v=4", + "profile": "https://github.com/lharress", + "contributions": [ + "ideas", + "test", + "code" + ] + }, + { + "login": "adrienbrault", + "name": "Adrien Brault", + "avatar_url": "https://avatars.githubusercontent.com/u/611271?v=4", + "profile": "https://www.linkedin.com/in/adrien-brault-4b987426/", + "contributions": [ + "code", + "test", + "bug", + "content", + "doc" + ] + }, + { + "login": "leggetter", + "name": "Phil Leggetter", + "avatar_url": "https://avatars.githubusercontent.com/u/328367?v=4", + "profile": "https://leggetter.co.uk", + "contributions": [ + "code", + "doc" + ] + }, + { + "login": "wushaobo", + "name": "Shaobo Wu", + "avatar_url": "https://avatars.githubusercontent.com/u/491264?v=4", + "profile": "https://github.com/wushaobo", + "contributions": [ + "question" + ] + }, + { + "login": "jonathanclarke", + "name": "Jonathan Clarke", + "avatar_url": "https://avatars.githubusercontent.com/u/11335?v=4", + "profile": "http://www.jonathanclarke.ie", + "contributions": [ + "code" + ] + }, + { + "login": "imgbot[bot]", + "name": "imgbot[bot]", + "avatar_url": "https://avatars.githubusercontent.com/in/4706?v=4", + "profile": "https://github.com/apps/imgbot", + "contributions": [ + "code" + ] + }, + { + "login": "well-balanced", + "name": "Woosik Kim", + "avatar_url": "https://avatars.githubusercontent.com/u/48206623?v=4", + "profile": "http://well-balanced.medium.com", + "contributions": [ + "code" + ] + }, + { + "login": "jeduden", + "name": "jeduden", + "avatar_url": "https://avatars.githubusercontent.com/u/1117699?v=4", + "profile": "https://github.com/jeduden", + "contributions": [ + "ideas", + "code", + "doc" + ] + }, + { + "login": "gempain", + "name": "Geoffroy Empain", + "avatar_url": "https://avatars.githubusercontent.com/u/13135149?v=4", + "profile": "https://github.com/gempain", + "contributions": [ + "userTesting" + ] + }, + { + "login": "rethab", + "name": "rethab", + "avatar_url": "https://avatars.githubusercontent.com/u/2222044?v=4", + "profile": "https://github.com/rethab", + "contributions": [ + "code", + "doc" + ] + }, + { + "login": "daviddanielarch", + "name": "David Arch", + "avatar_url": "https://avatars.githubusercontent.com/u/78377120?v=4", + "profile": "https://daviddanielarch.github.io/", + 
"contributions": [ + "code" + ] + }, + { + "login": "angelahuang89", + "name": "Angela Huang", + "avatar_url": "https://avatars.githubusercontent.com/u/22755100?v=4", + "profile": "https://github.com/angelahuang89", + "contributions": [ + "code" + ] + }, + { + "login": "kevinhu", + "name": "Kevin Hu", + "avatar_url": "https://avatars.githubusercontent.com/u/6051736?v=4", + "profile": "http://kevinhu.io", + "contributions": [ + "code" + ] + }, + { + "login": "afterwind-io", + "name": "Afterwind", + "avatar_url": "https://avatars.githubusercontent.com/u/16891493?v=4", + "profile": "https://github.com/afterwind-io", + "contributions": [ + "code" + ] + }, + { + "login": "swong194", + "name": "Sunny Wong", + "avatar_url": "https://avatars.githubusercontent.com/u/25137899?v=4", + "profile": "https://github.com/swong194", + "contributions": [ + "code" + ] + }, + { + "login": "Rajakavitha1", + "name": "Rajakavitha1", + "avatar_url": "https://avatars.githubusercontent.com/u/37059749?v=4", + "profile": "http://rajie.space", + "contributions": [ + "doc", + "bug", + "code" + ] + }, + { + "login": "thedeveloperr", + "name": "Mohit Gupta", + "avatar_url": "https://avatars.githubusercontent.com/u/23462580?v=4", + "profile": "https://github.com/thedeveloperr", + "contributions": [ + "code" + ] + }, + { + "login": "gesposito", + "name": "Gianluca Esposito", + "avatar_url": "https://avatars.githubusercontent.com/u/735227?v=4", + "profile": "https://www.esposi.to", + "contributions": [ + "code" + ] + }, + { + "login": "sankalpdomore", + "name": "Sankalp Sinha", + "avatar_url": "https://avatars.githubusercontent.com/u/18334593?v=4", + "profile": "http://www.sankalpsinha.com", + "contributions": [ + "code", + "design" + ] + }, + { + "login": "gagantrivedi", + "name": "gagandeep trivedi", + "avatar_url": "https://avatars.githubusercontent.com/u/18366226?v=4", + "profile": "https://github.com/gagantrivedi", + "contributions": [ + "code" + ] + }, + { + "login": "j-fuentes", + "name": "Jose Fuentes Castillo", + "avatar_url": "https://avatars.githubusercontent.com/u/10594577?v=4", + "profile": "https://github.com/j-fuentes", + "contributions": [ + "code" + ] + }, + { + "login": "akshayagarwal", + "name": "Akshay Agarwal", + "avatar_url": "https://avatars.githubusercontent.com/u/1273012?v=4", + "profile": "http://in.linkedin.com/in/akshayagr", + "contributions": [ + "code" + ] + }, + { + "login": "JeffreyQ", + "name": "Jeff Qiu", + "avatar_url": "https://avatars.githubusercontent.com/u/10890152?v=4", + "profile": "https://github.com/JeffreyQ", + "contributions": [ + "code" + ] + }, + { + "login": "ConradKurth", + "name": "Conrad", + "avatar_url": "https://avatars.githubusercontent.com/u/1794593?v=4", + "profile": "https://github.com/ConradKurth", + "contributions": [ + "code", + "plugin" + ] + }, + { + "login": "avorio", + "name": "André Avorio", + "avatar_url": "https://avatars.githubusercontent.com/u/649020?v=4", + "profile": "http://avor.io", + "contributions": [ + "bug" + ] + }, + { + "login": "tobiastornros", + "name": "Tobias Törnros", + "avatar_url": "https://avatars.githubusercontent.com/u/17402497?v=4", + "profile": "https://github.com/tobiastornros", + "contributions": [ + "bug", + "doc", + "code" + ] + }, + { + "login": "abhijitghate", + "name": "abhijitghate", + "avatar_url": "https://avatars.githubusercontent.com/u/11834249?v=4", + "profile": "https://github.com/abhijitghate", + "contributions": [ + "code" + ] + }, + { + "login": "c3ho", + "name": "Calvin ", + "avatar_url": 
"https://avatars.githubusercontent.com/u/18711727?v=4", + "profile": "https://c3ho.blogspot.com/", + "contributions": [ + "code" + ] + }, + { + "login": "DimitrisMazarakis", + "name": "Dimitris Mazarakis", + "avatar_url": "https://avatars.githubusercontent.com/u/56391437?v=4", + "profile": "https://github.com/DimitrisMazarakis", + "contributions": [ + "code" + ] + }, + { + "login": "pietrodevpiccini", + "name": "pietrodevpiccini", + "avatar_url": "https://avatars.githubusercontent.com/u/78323924?v=4", + "profile": "https://github.com/pietrodevpiccini", + "contributions": [ + "code", + "test", + "ideas" + ] + }, + { + "login": "mands", + "name": "Mandeep Gill", + "avatar_url": "https://avatars.githubusercontent.com/u/1010043?v=4", + "profile": "https://github.com/mands", + "contributions": [ + "code" + ] + }, + { + "login": "lutangar", + "name": "Johan Dufour", + "avatar_url": "https://avatars.githubusercontent.com/u/568769?v=4", + "profile": "https://larriereguichet.fr", + "contributions": [ + "code" + ] + }, + { + "login": "taobojlen", + "name": "Tao Bojlén", + "avatar_url": "https://avatars.githubusercontent.com/u/66130243?v=4", + "profile": "https://www.btao.org/", + "contributions": [ + "code", + "ideas" + ] + }, + { + "login": "marcushyett-ph", + "name": "Marcus Hyett (PostHog)", + "avatar_url": "https://avatars.githubusercontent.com/u/85295485?v=4", + "profile": "https://github.com/marcushyett-ph", + "contributions": [ + "code" + ] + }, + { + "login": "jonataslaw", + "name": "Jonny Borges", + "avatar_url": "https://avatars.githubusercontent.com/u/35742643?v=4", + "profile": "https://github.com/jonataslaw", + "contributions": [ + "code" + ] + }, + { + "login": "neilkakkar", + "name": "Neil Kakkar", + "avatar_url": "https://avatars.githubusercontent.com/u/7115141?v=4", + "profile": "http://neilkakkar.com", + "contributions": [ + "test", + "bug" + ] + }, + { + "login": "dbinetti", + "name": "David Binetti", + "avatar_url": "https://avatars.githubusercontent.com/u/161722?v=4", + "profile": "https://www.dbinetti.com", + "contributions": [ + "doc", + "ideas" + ] + }, + { + "login": "smallbrownbike", + "name": "Eli Kinsey", + "avatar_url": "https://avatars.githubusercontent.com/u/28248250?v=4", + "profile": "http://ekinsey.dev", + "contributions": [ + "code" + ] + }, + { + "login": "marcopchen", + "name": "Marco Chen", + "avatar_url": "https://avatars.githubusercontent.com/u/33271308?v=4", + "profile": "https://www.marcopchen.com/", + "contributions": [ + "code" + ] + }, + { + "login": "chidexebere", + "name": "Chidiebere Onyegbuchulem", + "avatar_url": "https://avatars.githubusercontent.com/u/25040059?v=4", + "profile": "https://conye.netlify.app/", + "contributions": [ + "code" + ] + }, + { + "login": "RayBB", + "name": "Raymond Berger", + "avatar_url": "https://avatars.githubusercontent.com/u/921217?v=4", + "profile": "http://raybb.github.io", + "contributions": [ + "code" + ] + }, + { + "login": "tirkarthi", + "name": "Karthikeyan Singaravelan", + "avatar_url": "https://avatars.githubusercontent.com/u/3972343?v=4", + "profile": "http://tirkarthi.github.io", + "contributions": [ + "code" + ] + }, + { + "login": "jacobherrington", + "name": "Jacob Herrington", + "avatar_url": "https://avatars.githubusercontent.com/u/11466782?v=4", + "profile": "https://dev.to/jacobherrington", + "contributions": [ + "code" + ] + }, + { + "login": "imhmdb", + "name": "Mohamad Bahamdain", + "avatar_url": "https://avatars.githubusercontent.com/u/34659256?v=4", + "profile": "https://mhmd.dev", + 
"contributions": [ + "code" + ] + }, + { + "login": "alx-a", + "name": "alx-a", + "avatar_url": "https://avatars.githubusercontent.com/u/26557823?v=4", + "profile": "https://github.com/alx-a", + "contributions": [ + "code" + ] + }, + { + "login": "Patil2099", + "name": "Pankaj Patil", + "avatar_url": "https://avatars.githubusercontent.com/u/35653876?v=4", + "profile": "https://pplife.home.blog", + "contributions": [ + "code" + ] + }, + { + "login": "purcell3a", + "name": "Angela Purcell ", + "avatar_url": "https://avatars.githubusercontent.com/u/62629855?v=4", + "profile": "http://purcell3a.github.io", + "contributions": [ + "code" + ] + }, + { + "login": "jredl-va", + "name": "Jesse Redl", + "avatar_url": "https://avatars.githubusercontent.com/u/2300103?v=4", + "profile": "http://www.vendasta.com/", + "contributions": [ + "code" + ] + }, + { + "login": "7MIMIRA", + "name": "Jose Lopez", + "avatar_url": "https://avatars.githubusercontent.com/u/63031501?v=4", + "profile": "https://github.com/7MIMIRA", + "contributions": [ + "code" + ] + }, + { + "login": "juanvasquezreyes", + "name": "juanvasquezreyes", + "avatar_url": "https://avatars.githubusercontent.com/u/20667703?v=4", + "profile": "https://github.com/juanvasquezreyes", + "contributions": [ + "code" + ] + }, + { + "login": "bryanyi", + "name": "bryanyi", + "avatar_url": "https://avatars.githubusercontent.com/u/66971225?v=4", + "profile": "http://bryanyi.com", + "contributions": [ + "code" + ] + }, + { + "login": "benbz", + "name": "Ben Banfield-Zanin", + "avatar_url": "https://avatars.githubusercontent.com/u/1325121?v=4", + "profile": "https://github.com/benbz", + "contributions": [ + "code" + ] + }, + { + "login": "julianharty", + "name": "Julian Harty", + "avatar_url": "https://avatars.githubusercontent.com/u/785891?v=4", + "profile": "http://blog.bettersoftwaretesting.com", + "contributions": [ + "doc" + ] + }, + { + "login": "bencgreenberg", + "name": "Ben Greenberg", + "avatar_url": "https://avatars.githubusercontent.com/u/13892277?v=4", + "profile": "http://www.bengreenberg.dev", + "contributions": [ + "code" + ] + }, + { + "login": "xahhy", + "name": "Herb", + "avatar_url": "https://avatars.githubusercontent.com/u/8667086?v=4", + "profile": "https://github.com/xahhy", + "contributions": [ + "code" + ] + }, + { + "login": "manish001in", + "name": "Manish Rastogi", + "avatar_url": "https://avatars.githubusercontent.com/u/7192261?v=4", + "profile": "https://github.com/manish001in", + "contributions": [ + "code" + ] + }, + { + "login": "jmellicker", + "name": "jmellicker", + "avatar_url": "https://avatars.githubusercontent.com/u/8551583?v=4", + "profile": "https://github.com/jmellicker", + "contributions": [ + "code" + ] + }, + { + "login": "mjashanks", + "name": "Michael Shanks", + "avatar_url": "https://avatars.githubusercontent.com/u/3524181?v=4", + "profile": "https://budibase.com", + "contributions": [ + "code" + ] + }, + { + "login": "edhgoose", + "name": "Edward Hartwell Goose", + "avatar_url": "https://avatars.githubusercontent.com/u/1108173?v=4", + "profile": "https://github.com/edhgoose", + "contributions": [ + "code" + ] + }, + { + "login": "steveyackey", + "name": "Steve Yackey", + "avatar_url": "https://avatars.githubusercontent.com/u/61758723?v=4", + "profile": "https://github.com/steveyackey", + "contributions": [ + "code" + ] + }, + { + "login": "asherf", + "name": "Asher Foa", + "avatar_url": "https://avatars.githubusercontent.com/u/1268088?v=4", + "profile": "https://github.com/asherf", + "contributions": [ + 
"code" + ] + }, + { + "login": "leoMehlig", + "name": "Leo Mehlig", + "avatar_url": "https://avatars.githubusercontent.com/u/9119485?v=4", + "profile": "https://twitter.com/leoMehlig", + "contributions": [ + "code" + ] + }, + { + "login": "banagale", + "name": "Rob Banagale", + "avatar_url": "https://avatars.githubusercontent.com/u/1409710?v=4", + "profile": "https://banagale.com", + "contributions": [ + "code" + ] + }, + { + "login": "skabbes", + "name": "Steven Kabbes", + "avatar_url": "https://avatars.githubusercontent.com/u/592178?v=4", + "profile": "https://github.com/skabbes", + "contributions": [ + "code" + ] + }, + { + "login": "csykes", + "name": "Chad Sykes", + "avatar_url": "https://avatars.githubusercontent.com/u/944809?v=4", + "profile": "https://github.com/csykes", + "contributions": [ + "code" + ] + }, + { + "login": "stefnnn", + "name": "Stefan N", + "avatar_url": "https://avatars.githubusercontent.com/u/6664911?v=4", + "profile": "https://github.com/stefnnn", + "contributions": [ + "code" + ] + }, + { + "login": "terrymunro", + "name": "Terence Munro", + "avatar_url": "https://avatars.githubusercontent.com/u/2586778?v=4", + "profile": "https://www.literacyplanet.com.au", + "contributions": [ + "code" + ] + }, + { + "login": "dakshshah96", + "name": "Daksh Shah", + "avatar_url": "https://avatars.githubusercontent.com/u/7896438?v=4", + "profile": "https://daksh.me", + "contributions": [ + "code" + ] + }, + { + "login": "geary", + "name": "Michael Geary", + "avatar_url": "https://avatars.githubusercontent.com/u/21968?v=4", + "profile": "http://mg.to/", + "contributions": [ + "code" + ] + }, + { + "login": "jyuvaraj03", + "name": "Yuvaraj J", + "avatar_url": "https://avatars.githubusercontent.com/u/29891001?v=4", + "profile": "http://jyuvaraj03.github.io", + "contributions": [ + "code" + ] + }, + { + "login": "mether", + "name": "Mack Etherington", + "avatar_url": "https://avatars.githubusercontent.com/u/13096366?v=4", + "profile": "http://mackenziee.com", + "contributions": [ + "code" + ] + }, + { + "login": "Jaspreet-singh-1032", + "name": "Jaspreet singh", + "avatar_url": "https://avatars.githubusercontent.com/u/69707565?v=4", + "profile": "https://github.com/Jaspreet-singh-1032", + "contributions": [ + "code" + ] + }, + { + "login": "inbreaks", + "name": "Le Ding", + "avatar_url": "https://avatars.githubusercontent.com/u/1317194?v=4", + "profile": "https://github.com/inbreaks", + "contributions": [ + "code" + ] + }, + { + "login": "xrendan", + "name": "Brendan Samek", + "avatar_url": "https://avatars.githubusercontent.com/u/13208566?v=4", + "profile": "https://github.com/xrendan", + "contributions": [ + "code" + ] + }, + { + "login": "Nishant-Sagar", + "name": "Nishant Sagar", + "avatar_url": "https://avatars.githubusercontent.com/u/66466895?v=4", + "profile": "https://github.com/Nishant-Sagar", + "contributions": [ + "content" + ] + }, + { + "login": "romj", + "name": "Jean Roman", + "avatar_url": "https://avatars.githubusercontent.com/u/44766458?v=4", + "profile": "https://github.com/romj", + "contributions": [ + "code" + ] + }, + { + "login": "pixlwave", + "name": "Doug", + "avatar_url": "https://avatars.githubusercontent.com/u/6060466?v=4", + "profile": "https://pixlwave.uk/", + "contributions": [ + "code" + ] + } + ], + "contributorsPerLine": 7, + "projectName": "posthog", + "projectOwner": "PostHog", + "repoType": "github", + "repoHost": "https://github.com", + "skipCi": true +} diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 
0000000000000..4af802f0e5612 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,10 @@ +[run] +source = + posthog/ + ee/ + +branch = true + +omit = + */migrations/* + manage.py diff --git a/.devcontainer/container_start.sh b/.devcontainer/container_start.sh new file mode 100755 index 0000000000000..3b537ae8554f7 --- /dev/null +++ b/.devcontainer/container_start.sh @@ -0,0 +1,8 @@ +#!/bin/bash +#set -e + +apt-get remove -y docker docker.io containerd runc +apt-get update +apt install -y docker.io +apt autoremove -y +echo "printf 'Hello 🦔! To start PostHog run this:\n "./ee/bin/docker-ch-dev-web"\n'" > ~/.bashrc \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000000000..de3a90b19bf2b --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,39 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the README at: +// https://github.com/microsoft/vscode-dev-containers/tree/v0.183.0/containers/docker-existing-docker-compose +// If you want to run as a non-root user in the container, see .devcontainer/docker-compose.yml. +{ + "name": "Existing Docker Compose (Extend)", + + // Update the 'dockerComposeFile' list if you have more compose files or use different names. + // The .devcontainer/docker-compose.yml file contains any overrides you need/want to make. + "dockerComposeFile": ["../ee/docker-compose.ch.yml", "docker-compose.yml"], + + // The 'service' property is the name of the service for the container that VS Code should + // use. Update this value and .devcontainer/docker-compose.yml to the real service name. + "service": "web", + + // The optional 'workspaceFolder' property is the path VS Code should open by default when + // connected. This is typically a file mount in .devcontainer/docker-compose.yml + "workspaceFolder": "/workspace", + + // Set *default* container specific settings.json values on container create. + "settings": {}, + + // Add the IDs of extensions you want installed when the container is created. + "extensions": [], + + // Use 'forwardPorts' to make a list of ports inside the container available locally. + "forwardPorts": [8000, 5432, 6379, 8123, 8234, 9000, 9092, 9440, 9009], + + // Uncomment the next line if you want start specific services in your Docker Compose config. + // "runServices": [], + + // Uncomment the next line if you want to keep your containers running after VS Code shuts down. + // "shutdownAction": "none", + + // Uncomment the next line to run commands after the container is created - for example installing curl. + "postCreateCommand": "./.devcontainer/container_start.sh" + + // Uncomment to connect as a non-root user if you've added one. See https://aka.ms/vscode-remote/containers/non-root. + // "remoteUser": "vscode" +} diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/docker-compose.yml new file mode 100644 index 0000000000000..8c37c2fdcc3b8 --- /dev/null +++ b/.devcontainer/docker-compose.yml @@ -0,0 +1,37 @@ +version: '3' +services: + # Update this to the name of the service you want to work with in your docker-compose.yml file + web: + # If you want add a non-root user to your Dockerfile, you can use the "remoteUser" + # property in devcontainer.json to cause VS Code its sub-processes (terminals, tasks, + # debugging) to execute as the user. Uncomment the next line if you want the entire + # container to run as this user instead. 
Note that, on Linux, you may need to + # ensure the UID and GID of the container user you create matches your local user. + # See https://aka.ms/vscode-remote/containers/non-root for details. + # + # user: vscode + + # Uncomment if you want to override the service's Dockerfile to one in the .devcontainer + # folder. Note that the path of the Dockerfile and context is relative to the *primary* + # docker-compose.yml file (the first in the devcontainer.json "dockerComposeFile" + # array). The sample below assumes your primary file is in the root of your project. + # + # build: + # context: . + # dockerfile: .devcontainer/Dockerfile + + volumes: + # Update this to wherever you want VS Code to mount the folder of your project + - .:/workspace:cached + + # Uncomment the next line to use Docker from inside the container. See https://aka.ms/vscode-remote/samples/docker-from-docker-compose for details. + - /var/run/docker.sock:/var/run/docker.sock + + # Uncomment the next four lines if you will use a ptrace-based debugger like C++, Go, and Rust. + # cap_add: + # - SYS_PTRACE + # security_opt: + # - seccomp:unconfined + + # Overrides default command so things don't shut down after the process ends. + command: /bin/sh -c "while sleep 1000; do :; done" diff --git a/.dockerignore b/.dockerignore index aa8e3c8e8c793..166e4dc7925ec 100644 --- a/.dockerignore +++ b/.dockerignore @@ -18,4 +18,5 @@ !frontend/src !frontend/types !frontend/public -!ee \ No newline at end of file +!frontend/*.* +!ee diff --git a/.eslintrc.js b/.eslintrc.js index bb232b85d3b82..bb200f0ab16fd 100644 --- a/.eslintrc.js +++ b/.eslintrc.js @@ -2,6 +2,7 @@ /* global module */ module.exports = { + ignorePatterns: ['node_modules', 'plugin-server'], env: { browser: true, es6: true, @@ -12,7 +13,7 @@ module.exports = { version: 'detect', }, }, - extends: ['plugin:@typescript-eslint/recommended', 'plugin:react/recommended', 'prettier/@typescript-eslint'], + extends: ['plugin:@typescript-eslint/recommended', 'plugin:react/recommended', 'prettier'], globals: { Atomics: 'readonly', SharedArrayBuffer: 'readonly', @@ -42,6 +43,10 @@ module.exports = { '@typescript-eslint/explicit-module-boundary-types': 'off', '@typescript-eslint/no-empty-function': 'off', '@typescript-eslint/no-inferrable-types': 'off', + '@typescript-eslint/ban-ts-comment': 'off', + 'no-shadow': 'error', + '@typescript-eslint/no-non-null-assertion': 'error', + curly: 'error', }, overrides: [ { @@ -74,7 +79,7 @@ module.exports = { { files: ['*.js'], rules: { - 'typescript/no-var-requires': 'off', + '@typescript-eslint/no-var-requires': 'off', }, }, ], diff --git a/.flake8 b/.flake8 index 86a993f964160..8d522e1e6edd3 100644 --- a/.flake8 +++ b/.flake8 @@ -5,3 +5,4 @@ import-order-style = pycharm max-line-length = 127 max-complexity = 10 +select=E9,F63,F7,F82,W605 \ No newline at end of file diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000..507cd01a11ecf --- /dev/null +++ b/.gitattributes @@ -0,0 +1,7 @@ +* text=auto + +.gitattributes export-ignore +.gitignore export-ignore +.github/ export-ignore +CONTRIBUTING.md export-ignore +CODE_OF_CONDUCT.md export-ignore diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 3937acd7dbf2f..98b334c444810 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,9 +1,7 @@ --- name: Bug report -about: Create a report to help us improve -title: '' +about: Something not working as expected? 
Let us look into it labels: bug - --- ## Bug description @@ -17,14 +15,15 @@ labels: bug ## How to reproduce -1. -2. -3. +1. +2. +3. ## Environment -- PostHog cloud or self-managed? -- PostHog version/commit +- [ ] PostHog Cloud +- [ ] self-hosted PostHog (ClickHouse-based), version/commit: _please provide_ +- [ ] self-hosted PostHog (Postgres-based, legacy), version/commit: _please provide_ ## Additional context diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 1010078a14bc0..24533264c3b99 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -1,8 +1,7 @@ --- name: Feature request -about: Suggest an idea for PostHog -title: '' -labels: enhancement +about: Suggest a feature for PostHog +labels: enhancement, feature --- diff --git a/.github/ISSUE_TEMPLATE/performance_issue_report.md b/.github/ISSUE_TEMPLATE/performance_issue_report.md new file mode 100644 index 0000000000000..871dc39f15a5c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/performance_issue_report.md @@ -0,0 +1,26 @@ +--- +name: Performance issue report +about: Long response times, high resource usage? Ensuring PostHog's scalable is our top priority +labels: performance +--- + +## In what situation are you experiencing subpar performance? + +*Please describe.* + +## How to reproduce + +1. +2. +3. + +## Environment + +- [ ] PostHog Cloud +- [ ] self-hosted PostHog, version/commit: _please provide_ + +## Additional context + + + +#### *Thank you* for your performance issue report – we want PostHog to go supersonic! diff --git a/.github/ISSUE_TEMPLATE/sprint_planning_retro.md b/.github/ISSUE_TEMPLATE/sprint_planning_retro.md new file mode 100644 index 0000000000000..d68971fab84da --- /dev/null +++ b/.github/ISSUE_TEMPLATE/sprint_planning_retro.md @@ -0,0 +1,67 @@ +--- +name: Sprint planning with retrospective +about: (internal) +labels: sprint +title: Sprint 1.n.0 m/2 - Jan 1 to Jan 12 +--- + +# Global Sprint Planning + +## Retro: Status of Outcomes from Previous Sprint + +1. +2. +3. +4. +5. + +## Retro: What can we do better next sprint? + +1. +2. +3. +4. +5. + +## Plan: Proposed Goals for Next Sprint + +Each goal should have a single owner. Owner can only be an engineer. + +1. +2. +3. +4. +5. + +# Team sprint planning + +For your team sprint planning copy this template into a comment below for each team. + +``` +Team ___ + +## Retro + + + +- + +## Hang over items from previous sprint + + + +- Item 1. prioritised/deprioritise + +## Planning + + + +### High priority + +- + +### Low priority / side quests + +- + +``` diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 735c89465881f..71c2803c71f94 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,10 +1,8 @@ ## Changes *Please describe.* -*If this affects the front-end, include screenshots.* +*If this affects the frontend, include screenshots.* -## Checklist +## How did you test this code? -- [ ] All querysets/queries filter by Organization, Team, and User (if this PR affects ANY querysets/queries). -- [ ] Django backend tests (if this PR affects the backend). -- [ ] Cypress end-to-end tests (if this PR affects the frontend). 
+*Please describe.* diff --git a/.github/workflows/auto-image.yml b/.github/workflows/auto-image.yml deleted file mode 100644 index 30501640c3dad..0000000000000 --- a/.github/workflows/auto-image.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: Create DigitalOcean Image - -on: - push: - tags: - - '*.*.*' - -jobs: - build: - name: Create and deploy DigitalOcean image - runs-on: ubuntu-20.04 - steps: - - name: Build and deploy image - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - DIGITALOCEAN_TOKEN: ${{ secrets.DIGITALOCEAN_TOKEN }} - run: | - wget https://releases.hashicorp.com/packer/1.6.0/packer_1.6.0_linux_amd64.zip - && unzip packer_1.6.0_linux_amd64.zip - && sudo mv packer /usr/bin/ - && sudo chmod +x /usr/bin/packer - && git clone https://github.com/posthog/deployment.git \ - && cd deployment/packer/digitalocean/single_node \ - && packer build digitalocean.json diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml new file mode 100644 index 0000000000000..2dfd0aaec195c --- /dev/null +++ b/.github/workflows/automerge.yml @@ -0,0 +1,35 @@ +name: Automerge + +env: + MERGE_METHOD: 'squash' + MERGE_RETRY_SLEEP: 300000 + +on: + pull_request: + types: + - labeled + - unlabeled + - synchronize + - opened + - edited + - ready_for_review + - reopened + - unlocked + check_suite: + types: + - completed + status: {} + +jobs: + automerge: + name: Automerge + runs-on: ubuntu-latest + env: + IS_POSTHOG_BOT_AVAILABLE: ${{ secrets.POSTHOG_BOT_GITHUB_TOKEN != '' }} + steps: + - name: Automerge + if: env.IS_POSTHOG_BOT_AVAILABLE == 'true' + uses: pascalgn/automerge-action@v0.14.2 + env: + GITHUB_TOKEN: ${{ secrets.POSTHOG_BOT_GITHUB_TOKEN }} + - run: echo diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 0000000000000..1eafa7b4a0f6f --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,184 @@ +name: Benchmark + +on: + pull_request: + branches: ['*'] + schedule: + - cron: '0 4 * * 1-5' # Mon-Fri 4AM UTC + workflow_dispatch: {} + +concurrency: 'benchmarks' # Ensure only one of this runs at a time + +jobs: + run-benchmarks: + name: Clickhouse queries + runs-on: ubuntu-20.04 + environment: clickhouse-benchmarks + + # Benchmarks are expensive to run so we only run them (periodically) against master branch and for PRs labeled `performance` + if: ${{ github.ref == 'refs/heads/master' || contains(github.event.pull_request.labels.*.name, 'performance') }} + + env: + SAML_DISABLED: '1' + DATABASE_URL: 'postgres://posthog:posthog@localhost:5432/posthog_test' + REDIS_URL: 'redis://localhost' + PRIMARY_DB: 'clickhouse' + DEBUG: '1' + CLICKHOUSE_DATABASE: posthog + CLICKHOUSE_HOST: ${{ secrets.BENCHMARKS_CLICKHOUSE_HOST }} + CLICKHOUSE_USER: ${{ secrets.BENCHMARKS_CLICKHOUSE_USER }} + CLICKHOUSE_PASSWORD: ${{ secrets.BENCHMARKS_CLICKHOUSE_PASSWORD }} + CLICKHOUSE_SECURE: 'false' + CLICKHOUSE_VERIFY: 'false' + SECRET_KEY: '6b01eee4f945ca25045b5aab440b953461faf08693a9abbf1166dc7c6b9772da' # unsafe - for testing only + + services: + postgres: + image: postgres:12 + env: + POSTGRES_USER: posthog + POSTGRES_PASSWORD: posthog + POSTGRES_DB: posthog_test + ports: + # Maps port 5432 on service container to the host + # Needed because `postgres` host is not discoverable for some reason + - 5432:5432 + options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 + redis: + image: redis + ports: + # Maps port 6379 on service container to the host + # Needed because `redis` host is not discoverable for some 
reason + - 6379:6379 + options: >- + --health-cmd "redis-cli ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + steps: + - uses: actions/checkout@v2 + with: + # Checkout repo with full history + fetch-depth: 0 + + - name: Check out PostHog/benchmarks-results repo + uses: actions/checkout@v2 + with: + path: ee/benchmarks/results + repository: PostHog/benchmark-results + token: ${{ secrets.POSTHOG_BOT_GITHUB_TOKEN }} + + - name: Set up Python 3.8 + uses: actions/setup-python@v2 + with: + python-version: 3.8 + + - uses: syphar/restore-virtualenv@v1.2 + id: cache-benchmark-tests + + - uses: syphar/restore-pip-download-cache@v1 + if: steps.cache-benchmark-tests.outputs.cache-hit != 'true' + + - name: Install python dependencies + if: steps.cache-benchmark-tests.outputs.cache-hit != 'true' + run: | + python -m pip install -r requirements-dev.txt + python -m pip install -r requirements.txt + + - name: Install asv + run: python -m pip install pip install git+https://github.com/airspeed-velocity/asv.git virtualenv + + - name: Set up PostHog + run: | + ./bin/docker-migrate & wait + python manage.py setup_dev --no-data + + - name: Configure benchmarks + run: asv machine --config ee/benchmarks/asv.conf.json --yes --machine ci-benchmarks + + - name: Run benchmarks + run: asv run --config ee/benchmarks/asv.conf.json --show-stderr --strict + + - name: Compare results + run: | + asv compare $(cat ee/benchmarks/results/last-master-commit) HEAD --config ee/benchmarks/asv.conf.json --factor 1.2 | tee pr_vs_master.txt + asv compare $(cat ee/benchmarks/results/last-master-commit) HEAD --config ee/benchmarks/asv.conf.json --factor 1.2 --only-changed | tee pr_vs_master_changed.txt + + - name: Save last benchmarked commit + if: ${{ github.ref == 'refs/heads/master' }} + run: echo "${{ github.sha }}" | tee ee/benchmarks/results/last-master-commit + + - name: Generate HTML report of results + if: ${{ github.ref == 'refs/heads/master' }} + run: asv publish --config ee/benchmarks/asv.conf.json + + - name: Commit update for benchmark results + if: ${{ github.repository == 'PostHog/posthog' && github.ref == 'refs/heads/master' }} + uses: stefanzweifel/git-auto-commit-action@v4 + with: + repository: ee/benchmarks/results + branch: master + commit_message: 'Save benchmark results' + commit_user_name: PostHog Bot + commit_user_email: hey@posthog.com + commit_author: PostHog Bot + + - name: Upload results as artifacts + uses: actions/upload-artifact@v2 + with: + name: benchmarks + path: | + pr_vs_master.txt + pr_vs_master_changed.txt + + - name: Read benchmark output + if: ${{ github.event_name == 'pull_request' }} + id: pr_vs_master_changed + uses: juliangruber/read-file-action@v1.0.0 + with: + path: pr_vs_master_changed.txt + + - name: Read benchmark output (full) + if: ${{ github.event_name == 'pull_request' }} + id: pr_vs_master + uses: juliangruber/read-file-action@v1.0.0 + with: + path: pr_vs_master.txt + + - name: Find Comment + if: ${{ github.event_name == 'pull_request' }} + uses: peter-evans/find-comment@v1 + id: fc + with: + issue-number: ${{ github.event.number }} + comment-author: 'github-actions[bot]' + body-includes: ClickHouse query benchmark results from GitHub Actions + + - name: Create or update comment + if: ${{ github.event_name == 'pull_request' }} + uses: peter-evans/create-or-update-comment@v1 + with: + comment-id: ${{ steps.fc.outputs.comment-id }} + issue-number: ${{ github.event.number }} + body: | + ClickHouse query benchmark results from GitHub Actions + + Lower numbers 
are good, higher numbers are bad. A ratio less than 1 + means a speed up and greater than 1 means a slowdown. Green lines + beginning with `+` are slowdowns (the PR is slower than master or + master is slower than the previous release). Red lines beginning + with `-` are speedups. Blank means no changes. + + Significantly changed benchmark results (PR vs master) + ```diff + ${{ steps.pr_vs_master_changed.outputs.content }} + ``` + + <details>
+ <summary>Click to view full benchmark results</summary> + + ```diff + ${{ steps.pr_vs_master.outputs.content }} + ``` + </details>
+ edit-mode: replace diff --git a/.github/workflows/ci-backend.yml b/.github/workflows/ci-backend.yml index fe46dc11fe3d8..c6bc9f897ea3b 100644 --- a/.github/workflows/ci-backend.yml +++ b/.github/workflows/ci-backend.yml @@ -1,10 +1,58 @@ name: Backend CI on: - - pull_request + push: + branches: + - master + pull_request: +env: + SECRET_KEY: '6b01eee4f945ca25045b5aab440b953461faf08693a9abbf1166dc7c6b9772da' # unsafe - for testing only + DATABASE_URL: 'postgres://posthog:posthog@localhost:5432/posthog' + REDIS_URL: 'redis://localhost' + CLICKHOUSE_HOST: 'localhost' + CLICKHOUSE_SECURE: 'False' + CLICKHOUSE_VERIFY: 'False' + SAML_DISABLED: 1 + TEST: 1 jobs: - code-quality: + # Job to decide if we should run backend CI + # See https://github.com/dorny/paths-filter#conditional-execution for more details + changes: + runs-on: ubuntu-latest + if: github.repository == 'PostHog/posthog' + name: Determine need to run backend checks + # Set job outputs to values from filter step + outputs: + backend: ${{ steps.filter.outputs.backend }} + steps: + # For pull requests it's not necessary to check out the code, but we + # also want this to run on master so we need to check out + - uses: actions/checkout@v2 + + - uses: dorny/paths-filter@v2 + id: filter + with: + filters: | + backend: + # Avoid running backend tests for irrelevant changes + # NOTE: we are at risk of missing a dependency here. We could make + # the dependencies clearer if we separated the backend/frontend + # code completely + - 'ee/**/*' + - 'posthog/**/*' + - requirements.txt + - requirements-dev.txt + - mypy.ini + - pytest.ini + # Make sure we run if someone explicitly changes the workflow + - .github/workflows/ci-backend.yml + + backend-code-quality: + needs: changes + # Make sure we only run on backend changes + if: ${{ needs.changes.outputs.backend == 'true' && github.repository == 'PostHog/posthog' }} + name: Code quality checks runs-on: ubuntu-latest @@ -12,9 +60,9 @@ jobs: postgres: image: postgres:12 env: - POSTGRES_USER: postgres - POSTGRES_PASSWORD: postgres - POSTGRES_DB: postgres + POSTGRES_USER: posthog + POSTGRES_PASSWORD: posthog + POSTGRES_DB: posthog ports: ['5432:5432'] options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 @@ -28,19 +76,19 @@ jobs: with: python-version: 3.8 - - uses: actions/cache@v1 + - uses: syphar/restore-virtualenv@v1.2 + id: cache-backend-tests with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt', '**/requirements-dev.txt') }} - restore-keys: | - ${{ runner.os }}-pip- + custom_cache_key_element: v1- + + - uses: syphar/restore-pip-download-cache@v1 + if: steps.cache-backend-tests.outputs.cache-hit != 'true' - - name: Install Python dependencies with pip + - name: Install python dependencies + if: steps.cache-backend-tests.outputs.cache-hit != 'true' run: | - python -m pip install -U pip - python -m pip install -r requirements.txt python -m pip install -r requirements-dev.txt - if: steps.cache.outputs.cache-hit != 'true' + python -m pip install -r requirements.txt - name: Check formatting run: | @@ -55,160 +103,197 @@ jobs: flake8 . --count --exit-zero --max-complexity=10 --max-line-length=120 --statistics - name: Typecheck - env: - SECRET_KEY: '6b01eee4f945ca25045b5aab440b953461faf08693a9abbf1166dc7c6b9772da' # unsafe - for testing only - DATABASE_URL: 'postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/postgres' - REDIS_URL: 'redis://localhost' run: | mypy .
django: - name: Django tests – Py ${{ matrix.python-version }} + needs: changes + if: ${{ needs.changes.outputs.backend == 'true' && github.repository == 'PostHog/posthog' }} + + name: Django tests – Py ${{ matrix.python-version }} ${{ matrix.name }} (${{matrix.group}}/${{ matrix.concurrency }}) runs-on: ubuntu-latest + strategy: fail-fast: false matrix: - python-version: ['3.7.8', '3.8.5', '3.9.0'] - services: - postgres: - image: postgres:12 - env: - POSTGRES_USER: postgres - POSTGRES_PASSWORD: postgres - POSTGRES_DB: postgres - ports: ['5432:5432'] - options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 - clickhouse-server: - image: yandex/clickhouse-server - ports: - - '8123:8123' - - '9000:9000' - - '9440:9440' - - '9009:9009' + python-version: ['3.8.5'] + ee: [true] + foss: [false] + saml: [false] + name: [''] + # :NOTE: Keep concurrency and group's in sync + concurrency: [5] + group: [1, 2, 3, 4, 5] + include: + # :TRICKY: Run FOSS tests in a separate container + - python-version: '3.8.5' + ee: false + saml: false + foss: true + name: 'FOSS' + concurrency: 1 + group: 1 + # :TRICKY: Run FOSS tests in a separate container + - python-version: '3.9.0' + ee: false + saml: false + foss: true + name: 'FOSS' + concurrency: 1 + group: 1 + # :TRICKY: Run SAML tests in a separate container + - python-version: '3.8.5' + ee: false + saml: true + foss: false + name: 'SAML' + concurrency: 1 + group: 1 steps: - uses: actions/checkout@v1 with: fetch-depth: 1 + - name: Start stack with Docker Compose + run: docker-compose -f ee/docker-compose.ch.yml up -d ${{ matrix.foss && 'db' || 'db clickhouse zookeeper kafka' }} + - name: Set up Python uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - - uses: actions/cache@v1 + - name: Install SAML (python3-saml) dependencies (not required for Cloud or FOSS) + if: ${{ matrix.saml }} + run: sudo apt-get install libxml2-dev libxmlsec1-dev libxmlsec1-openssl + + - uses: syphar/restore-virtualenv@v1.2 + id: cache-backend-tests with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('**/requirements.txt') }} - restore-keys: | - ${{ runner.os }}-pip-${{ matrix.python-version }}- + custom_cache_key_element: v1-${{ matrix.name }} - - name: Install requirements.txt dependencies with pip + - uses: syphar/restore-pip-download-cache@v1 + if: steps.cache-backend-tests.outputs.cache-hit != 'true' + + - name: Install python dependencies + if: steps.cache-backend-tests.outputs.cache-hit != 'true' run: | - python -m pip install --upgrade pip + python -m pip install -r requirements-dev.txt python -m pip install -r requirements.txt - python -m pip install freezegun fakeredis - if: steps.cache.outputs.cache-hit != 'true' + + - name: Install SAML python dependencies + if: ${{ matrix.saml }} + run: | + python -m pip install python3-saml==1.12.0 - name: Check migrations - env: - SECRET_KEY: '6b01eee4f945ca25045b5aab440b953461faf08693a9abbf1166dc7c6b9772da' # unsafe - for testing only - DATABASE_URL: 'postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/postgres' - REDIS_URL: 'redis://localhost' - run: python manage.py makemigrations --check --dry-run + run: | + python manage.py makemigrations --check --dry-run + git fetch origin master + # `git diff --name-only` returns a list of files that were changed - added OR deleted OR modified + # With `--name-status` we get the same, but including a column for status, respectively: A, D, M + # In this 
check we exclusively care about files that were added (A) in posthog/migrations/ + git diff --name-status origin/master..HEAD | grep "A\tposthog/migrations/" | awk '{print $2}' | python manage.py test_migrations_are_null - - name: Run posthog tests - env: - SECRET_KEY: '6b01eee4f945ca25045b5aab440b953461faf08693a9abbf1166dc7c6b9772da' # unsafe - for testing only - DATABASE_URL: 'postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/postgres' + - name: Add kafka host to /etc/hosts for kafka connectivity + run: sudo echo "127.0.0.1 kafka" | sudo tee -a /etc/hosts + + - name: Set up needed files run: | mkdir -p frontend/dist touch frontend/dist/index.html touch frontend/dist/layout.html touch frontend/dist/shared_dashboard.html - python manage.py test posthog -v 2 - - name: Run EE tests + + - name: Run FOSS tests + if: ${{ matrix.foss }} + run: | + rm -rf ee/ + pytest -m "not ee" posthog/ --cov --cov-report=xml:coverage-postgres.xml + + - name: Run SAML tests + if: ${{ matrix.saml }} env: - SECRET_KEY: '6b01eee4f945ca25045b5aab440b953461faf08693a9abbf1166dc7c6b9772da' # unsafe - for testing only - DATABASE_URL: 'postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/postgres' PRIMARY_DB: 'clickhouse' - CLICKHOUSE_HOST: 'localhost' - CLICKHOUSE_DATABASE: 'posthog_test' - CLICKHOUSE_SECURE: 'False' - CLICKHOUSE_VERIFY: 'False' run: | - mkdir -p frontend/dist - touch frontend/dist/index.html - touch frontend/dist/layout.html - touch frontend/dist/shared_dashboard.html - python manage.py test ee --testrunner="ee.clickhouse.clickhouse_test_runner.ClickhouseTestRunner" + pytest ee -m "saml_only" + + - name: Run ee/ tests + if: ${{ matrix.ee }} + env: + PRIMARY_DB: 'clickhouse' + run: | + pytest ee -m "not saml_only" \ + --splits ${{ matrix.concurrency }} \ + --group ${{ matrix.group }} \ + --store-durations \ + --cov \ + --cov-report=xml:coverage-clickhouse.xml + + - name: Run pytest.mark.ee tests + if: ${{ matrix.ee && matrix.group == '1' }} + env: + PRIMARY_DB: 'clickhouse' + run: | + pytest posthog -m "ee" + + - uses: codecov/codecov-action@v2 + if: ${{ !matrix.saml && !matrix.cloud }} + with: + files: ./coverage-postgres.xml,./coverage-clickhouse.xml + fail_ci_if_error: false + verbose: true + + - name: Upload updated timing data as artifacts + uses: actions/upload-artifact@v2 + if: ${{ matrix.ee }} + with: + name: timing_data-${{ matrix.group }} + path: .test_durations cloud: + needs: changes + if: ${{ needs.changes.outputs.backend == 'true' && github.repository == 'PostHog/posthog' }} + name: Django tests – Cloud runs-on: ubuntu-latest - - services: - postgres: - image: postgres:12 - env: - POSTGRES_USER: postgres - POSTGRES_PASSWORD: postgres - POSTGRES_DB: postgres - ports: ['5432:5432'] - options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 - clickhouse-server: - image: yandex/clickhouse-server - ports: - - '8123:8123' - - '9000:9000' - - '9440:9440' - - '9009:9009' - steps: - - name: Fetch posthog-production + - name: Fetch posthog-cloud run: | - curl -L https://github.com/posthog/posthog-production/tarball/master | tar --strip-components=1 -xz -- + curl -L https://github.com/posthog/posthog-cloud/tarball/master | tar --strip-components=1 -xz -- mkdir deploy/ - - name: Checkout master uses: actions/checkout@v2 with: ref: 'master' path: 'deploy/' - - - name: Link posthog-production at master + - name: Link posthog-cloud at master run: | cp -r multi_tenancy deploy/ cp -r messaging deploy/ cat 
multi_tenancy_settings.py >> deploy/posthog/settings.py cat requirements.txt >> deploy/requirements.txt - + - name: Start stack with Docker Compose + run: docker-compose -f deploy/ee/docker-compose.ch.yml up -d db clickhouse zookeeper kafka - name: Set up Python 3.8 uses: actions/setup-python@v2 with: python-version: 3.8 - - uses: actions/cache@v2 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} - restore-keys: | - ${{ runner.os }}-pip- + - uses: syphar/restore-virtualenv@v1.2 + id: cache-backend-tests + + - uses: syphar/restore-pip-download-cache@v1 + if: steps.cache-backend-tests.outputs.cache-hit != 'true' - - name: Install requirements.txt dependencies with pip + - name: Install python dependencies + if: steps.cache-backend-tests.outputs.cache-hit != 'true' run: | - cd deploy - python -m pip install --upgrade pip - python -m pip install -r requirements.txt - python -m pip install freezegun fakeredis - if: steps.cache.outputs.cache-hit != 'true' + python -m pip install -r deploy/requirements-dev.txt + python -m pip install -r deploy/requirements.txt # The 2-step migration process (first master, then current branch) verifies that it'll always # be possible to migrate to the new version without problems in production - name: Migrate initially at master, then remove master deploy code - env: - SECRET_KEY: '6b01eee4f945ca25045b5aab440b953461faf08693a9abbf1166dc7c6b9772da' # unsafe - for testing only - DATABASE_URL: 'postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/postgres' - REDIS_URL: 'redis://localhost' run: | python deploy/manage.py migrate rm -rf deploy @@ -218,119 +303,43 @@ jobs: with: path: 'deploy/' - - name: Link posthog-production at current branch + - name: Install requirements.txt dependencies with pip at current branch run: | + cd deploy + python -m pip install --upgrade pip + python -m pip install -r requirements.txt + python -m pip install freezegun fakeredis pytest pytest-mock pytest-django syrupy + + - name: Link posthog-cloud at current branch + run: | + cp deploy/ee/conftest.py multi_tenancy/conftest.py + cp deploy/ee/conftest.py messaging/conftest.py cp -r multi_tenancy deploy/ cp -r messaging deploy/ cat multi_tenancy_settings.py >> deploy/posthog/settings.py cat requirements.txt >> deploy/requirements.txt - name: Check migrations - env: - SECRET_KEY: '6b01eee4f945ca25045b5aab440b953461faf08693a9abbf1166dc7c6b9772da' # unsafe - for testing only - DATABASE_URL: 'postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/postgres' - REDIS_URL: 'redis://localhost' run: | cd deploy python manage.py makemigrations --check --dry-run python manage.py migrate - - name: Run posthog tests - env: - SECRET_KEY: '6b01eee4f945ca25045b5aab440b953461faf08693a9abbf1166dc7c6b9772da' # unsafe - for testing only - DATABASE_URL: 'postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/postgres' - REDIS_URL: 'redis://localhost' + - name: Add kafka host to /etc/hosts for kafka connectivity + run: sudo echo "127.0.0.1 kafka" | sudo tee -a /etc/hosts + + - name: Set up needed files run: | cd deploy mkdir -p frontend/dist touch frontend/dist/index.html touch frontend/dist/layout.html touch frontend/dist/shared_dashboard.html - python manage.py test posthog --keepdb -v 2 --exclude-tag=skip_on_multitenancy - - name: Run cloud tests (posthog-production) + - name: Run cloud tests (posthog-cloud) env: - SECRET_KEY: 
'6b01eee4f945ca25045b5aab440b953461faf08693a9abbf1166dc7c6b9772da' # unsafe - for testing only - DATABASE_URL: 'postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/postgres' - REDIS_URL: 'redis://localhost' PRIMARY_DB: 'clickhouse' - CLICKHOUSE_HOST: 'localhost' - CLICKHOUSE_DATABASE: 'posthog_test' - CLICKHOUSE_SECURE: 'False' - CLICKHOUSE_VERIFY: 'False' run: | + source .env.template cd deploy - python manage.py test multi_tenancy messaging --keepdb -v 2 --exclude-tag=skip_on_multitenancy - - name: Run EE tests - env: - SECRET_KEY: '6b01eee4f945ca25045b5aab440b953461faf08693a9abbf1166dc7c6b9772da' # unsafe - for testing only - DATABASE_URL: 'postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/postgres' - REDIS_URL: 'redis://localhost' - PRIMARY_DB: 'clickhouse' - CLICKHOUSE_HOST: 'localhost' - CLICKHOUSE_DATABASE: 'posthog_test' - CLICKHOUSE_SECURE: 'False' - CLICKHOUSE_VERIFY: 'False' - run: | - cd deploy - python manage.py test ee --keepdb - - foss: - name: Django tests – FOSS - runs-on: ubuntu-latest - - services: - postgres: - image: postgres:12 - env: - POSTGRES_USER: postgres - POSTGRES_PASSWORD: postgres - POSTGRES_DB: postgres - ports: ['5432:5432'] - options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 - - steps: - - uses: actions/checkout@v1 - with: - fetch-depth: 1 - - - name: Set up Python 3.8 - uses: actions/setup-python@v2 - with: - python-version: 3.8 - - - uses: actions/cache@v1 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} - restore-keys: | - ${{ runner.os }}-pip- - - - name: Install requirements.txt dependencies with pip - run: | - python -m pip install --upgrade pip - python -m pip install -r requirements.txt - python -m pip install freezegun fakeredis - if: steps.cache.outputs.cache-hit != 'true' - - - name: Remove ee - run: | - rm -rf ee/ - - - name: Check migrations - env: - SECRET_KEY: '6b01eee4f945ca25045b5aab440b953461faf08693a9abbf1166dc7c6b9772da' # unsafe - for testing only - DATABASE_URL: 'postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/postgres' - REDIS_URL: 'redis://localhost' - run: python manage.py makemigrations --check --dry-run - - - name: Run tests - env: - SECRET_KEY: '6b01eee4f945ca25045b5aab440b953461faf08693a9abbf1166dc7c6b9772da' # unsafe - for testing only - DATABASE_URL: 'postgres://postgres:postgres@localhost:${{ job.services.postgres.ports[5432] }}/postgres' - run: | - mkdir -p frontend/dist - touch frontend/dist/index.html - touch frontend/dist/layout.html - touch frontend/dist/shared_dashboard.html - python manage.py test -v 2 --exclude-tag=ee + pytest multi_tenancy messaging -m "not skip_on_multitenancy" diff --git a/.github/workflows/ci-frontend.yml b/.github/workflows/ci-frontend.yml index c3446c1931eac..27240e455f0df 100644 --- a/.github/workflows/ci-frontend.yml +++ b/.github/workflows/ci-frontend.yml @@ -1,12 +1,14 @@ name: Frontend CI on: - - pull_request + pull_request: + paths-ignore: + - 'plugin-server/**' jobs: - code-quality: + frontend-code-quality: name: Code quality checks - runs-on: ubuntu-latest + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v1 @@ -15,11 +17,53 @@ jobs: with: node-version: 14 + - uses: actions/cache@v2 + id: node-modules-cache + with: + path: | + node_modules + key: ${{ runner.os }}-node-modules-${{ hashFiles('**/yarn.lock') }} + restore-keys: | + ${{ runner.os }}-node-modules + - name: Install package.json dependencies 
with Yarn - run: yarn + if: steps.node-modules-cache.outputs.cache-hit != 'true' + run: yarn install --frozen-lockfile - name: Check formatting with prettier run: yarn prettier:check - name: Lint with ESLint run: yarn eslint + + - name: Run typescript with strict + run: | + ./bin/check-typescript-strict + + jest: + name: Jest tests + runs-on: ubuntu-20.04 + + steps: + - uses: actions/checkout@v1 + + - name: Set up Node 14 + uses: actions/setup-node@v1 + with: + node-version: 14 + + - uses: actions/cache@v2 + id: node-modules-cache + with: + path: | + node_modules + key: ${{ runner.os }}-node-modules-${{ hashFiles('**/yarn.lock') }} + restore-keys: | + ${{ runner.os }}-node-modules + + - name: Install package.json dependencies with Yarn + if: steps.node-modules-cache.outputs.cache-hit != 'true' + run: yarn install --frozen-lockfile + + - name: Test with Jest + run: yarn test diff --git a/.github/workflows/cypress-component.yml b/.github/workflows/cypress-component.yml new file mode 100644 index 0000000000000..9972d6e12a539 --- /dev/null +++ b/.github/workflows/cypress-component.yml @@ -0,0 +1,61 @@ +name: Cypress Component + +on: + pull_request: + paths-ignore: + - 'plugin-server/**' + +jobs: + cypress-component: + name: Cypress component tests + runs-on: ubuntu-18.04 + + steps: + - name: Checkout + uses: actions/checkout@v1 + - uses: actions/setup-node@v1 + with: + node-version: 14 + - uses: actions/cache@v2 + id: cypress-node-modules-cache-2 + with: + path: | + **/node_modules + ~/.cache/Cypress + key: ${{ runner.os }}-cypress-node-modules-2-${{ hashFiles('**/yarn.lock') }} + restore-keys: | + ${{ runner.os }}-cypress-node-modules-2 + - name: Yarn install deps + if: steps.cypress-node-modules-cache-2.outputs.cache-hit != 'true' + run: | + yarn install --frozen-lockfile + yarn add cypress@6.7.0 cypress-terminal-report@2.1.0 @cypress/react@4.16.4 @cypress/webpack-preprocessor@5.7.0 + - name: Yarn build + env: + GENERATE_SOURCEMAP: 'false' + run: | + yarn build + - name: Cypress run + uses: cypress-io/github-action@v2 + with: + config-file: cypress.json + record: true + parallel: true + group: 'PostHog Component' + # We're already installing cypress above + # We have to install it separately otherwise the tests fail. 
+ install: false + # We already install cypress separately, we don't need to install it again here + install-command: echo "no" + env: + # pass the Dashboard record key as an environment variable + CYPRESS_RECORD_KEY: ${{ secrets.CYPRESS_RECORD_KEY }} + # Recommended: pass the GitHub token lets this action correctly + # determine the unique run id necessary to re-run the checks + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Archive test screenshots + uses: actions/upload-artifact@v1 + with: + name: screenshots + path: cypress/screenshots + if: ${{ failure() }} diff --git a/.github/workflows/docker-image-publish.yml b/.github/workflows/docker-image-publish.yml new file mode 100644 index 0000000000000..ba312103c9291 --- /dev/null +++ b/.github/workflows/docker-image-publish.yml @@ -0,0 +1,59 @@ +name: Docker + +on: + push: + branches: + - master + - main + +jobs: + build-push: + name: Build Docker images and push them + if: github.repository == 'PostHog/posthog' + runs-on: ubuntu-20.04 + steps: + - name: Checkout default branch + uses: actions/checkout@v2 + + - name: Update git sha + run: echo "GIT_SHA = '${GITHUB_SHA}'" >posthog/gitsha.py + + - name: Set up QEMU + if: github.repository == 'PostHog/posthog' + uses: docker/setup-qemu-action@v1 + + - name: Set up Docker Buildx + if: github.repository == 'PostHog/posthog' + uses: docker/setup-buildx-action@v1 + + - name: Login to DockerHub + if: github.repository == 'PostHog/posthog' + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build and push latest + id: docker-latest + if: github.repository == 'PostHog/posthog' + uses: docker/build-push-action@v2 + with: + context: . + push: true + tags: posthog/posthog:latest + + - name: Build and push dev + id: docker-dev + if: github.repository == 'PostHog/posthog' + uses: docker/build-push-action@v2 + with: + context: . 
+ file: dev.Dockerfile + push: true + tags: posthog/posthog:dev + + - name: Image digests + if: github.repository == 'PostHog/posthog' + run: | + echo "Pushed latest: ${{ steps.docker-latest.outputs.digest }}" + echo "Pushed dev: ${{ steps.docker-dev.outputs.digest }}" diff --git a/.github/workflows/docker-image-test.yml b/.github/workflows/docker-image-test.yml new file mode 100644 index 0000000000000..bda28fa429741 --- /dev/null +++ b/.github/workflows/docker-image-test.yml @@ -0,0 +1,49 @@ +name: Docker + +on: + - pull_request + +jobs: + build: + name: Test image build + runs-on: ubuntu-20.04 + + steps: + - name: Checkout PR branch + uses: actions/checkout@v2 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v1 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + + - name: Build + id: docker_build + uses: docker/build-push-action@v2 + with: + push: false + tags: posthog/posthog:testing + + saml: + name: Test image build (without SAML dependencies) + runs-on: ubuntu-20.04 + + steps: + - name: Checkout PR branch + uses: actions/checkout@v2 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v1 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + + - name: Build + id: docker_build_no_saml + uses: docker/build-push-action@v2 + with: + push: false + tags: posthog/posthog:testing-no-saml + build-args: | + saml_disabled=1 diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml deleted file mode 100644 index 7265ddebac47d..0000000000000 --- a/.github/workflows/docker-image.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: Docker - -on: - - pull_request - -jobs: - build: - name: Test building of Docker image - runs-on: ubuntu-20.04 - steps: - - name: Checkout code - uses: actions/checkout@v2 - - - name: Build and push Docker images - uses: docker/build-push-action@v1 - with: - cache_froms: posthog/posthog:latest - dockerfile: production.Dockerfile - repository: posthog/posthog - push: false - tags: latest diff --git a/.github/workflows/docker-release-image-publish.yml b/.github/workflows/docker-release-image-publish.yml new file mode 100644 index 0000000000000..a4518cfacd114 --- /dev/null +++ b/.github/workflows/docker-release-image-publish.yml @@ -0,0 +1,53 @@ +# Generates the `posthog/posthog:latest-release` & `posthog/posthog:release-[version]` Docker images +# and pushes to Docker Hub +name: Docker release image + +on: + push: + tags: + - '*.**' + +jobs: + build-push: + name: Build & push Docker release image + runs-on: ubuntu-20.04 + steps: + - name: Checkout default branch + uses: actions/checkout@v2 + + - name: Get tag name + run: echo "TAG_NAME=$(echo ${GITHUB_REF#refs/tags/} | tr / -)" >> $GITHUB_ENV + + - name: Update git SHA + run: echo "GIT_SHA = '${GITHUB_SHA}'" > posthog/gitsha.py + + - name: Set up QEMU + if: github.repository == 'PostHog/posthog' + uses: docker/setup-qemu-action@v1 + + - name: Set up Docker Buildx + if: github.repository == 'PostHog/posthog' + uses: docker/setup-buildx-action@v1 + + - name: Login to DockerHub + if: github.repository == 'PostHog/posthog' + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build and push release + id: docker-release + if: github.repository == 'PostHog/posthog' + uses: docker/build-push-action@v2 + with: + context: . 
+ push: true + tags: | + posthog/posthog:latest-release + posthog/posthog:release-${{ env.TAG_NAME }} + + - name: Image digest + if: github.repository == 'PostHog/posthog' + run: | + echo "Pushed release: ${{ steps.docker-release.outputs.digest }}" diff --git a/.github/workflows/docker-unstable-image.yml b/.github/workflows/docker-unstable-image.yml new file mode 100644 index 0000000000000..813a95b0a3eab --- /dev/null +++ b/.github/workflows/docker-unstable-image.yml @@ -0,0 +1,52 @@ +# Generates the `posthog/posthog:release-[version]-unstable` Docker image and pushes to Docker Hub +# when a branch that matches `release-[version]` is pushed. The image can be used for 'break the release' sessions. +name: Docker unstable image for code freeze + +on: + push: + branches: + - 'release-*.*' + +jobs: + build-release-push: + name: Build & push Docker release image + if: github.repository == 'PostHog/posthog' + runs-on: ubuntu-20.04 + steps: + - name: Checkout default branch + uses: actions/checkout@v2 + + - name: Get branch name + run: echo "BRANCH_NAME=$(echo ${GITHUB_REF#refs/heads/} | tr / -)" >> $GITHUB_ENV + + - name: Update git SHA + run: echo "GIT_SHA = '${GITHUB_SHA}'" > posthog/gitsha.py + + - name: Set up QEMU + if: github.repository == 'PostHog/posthog' + uses: docker/setup-qemu-action@v1 + + - name: Set up Docker Buildx + if: github.repository == 'PostHog/posthog' + uses: docker/setup-buildx-action@v1 + + - name: Login to DockerHub + if: github.repository == 'PostHog/posthog' + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build and push release + id: docker-release + if: github.repository == 'PostHog/posthog' + uses: docker/build-push-action@v2 + with: + context: .
+ push: true + tags: posthog/posthog:${{ env.BRANCH_NAME }}-unstable + + - name: Image digest + if: github.repository == 'PostHog/posthog' + run: | + echo "Pushed release: ${{ steps.docker-release.outputs.digest }}" diff --git a/.github/workflows/dockerfile-lint.yml b/.github/workflows/dockerfile-lint.yml new file mode 100644 index 0000000000000..3221ad7082bb1 --- /dev/null +++ b/.github/workflows/dockerfile-lint.yml @@ -0,0 +1,26 @@ +name: Dockerfile + +on: + - pull_request + +jobs: + build: + name: Lint + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Check if any Dockerfile has changed + id: changed-files + uses: tj-actions/changed-files@v9.3 + with: + files: | + **Dockerfile + separator: ' ' + + - name: Lint changed Dockerfile(s) with Hadolint + uses: jbergstroem/hadolint-gh-action@v1 + if: steps.changed-files.outputs.any_changed == 'true' + with: + dockerfile: '${{ steps.changed-files.outputs.all_modified_files }}' diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml index 20580e76e97bd..8b0ef29b5cde9 100644 --- a/.github/workflows/e2e.yml +++ b/.github/workflows/e2e.yml @@ -4,18 +4,36 @@ on: - pull_request jobs: + cypress_prep: + name: Cypress E2E preparation + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + steps: + - name: Check out + uses: actions/checkout@v2 + - id: set-matrix + run: | + if ${{github.event.pull_request.head.repo.full_name == github.repository}}; then + matrix=$(jq 'map(.)' .github/workflows/e2e_matrix.json) + else + matrix='[{"containers": [1]}]' + fi + echo ::set-output name=matrix::{\"include\":$(echo $matrix)}\" + cypress: - name: Cypress tests + name: Cypress E2E tests runs-on: ubuntu-18.04 + needs: cypress_prep + strategy: # when one test fails, DO NOT cancel the other # containers, because this will kill Cypress processes # leaving the Dashboard hanging ... 
# https://github.com/cypress-io/github-action/issues/48 fail-fast: false - matrix: - # run 7 copies of the current job in parallel - containers: [1, 2, 3, 4, 5, 6, 7] + matrix: ${{fromJson(needs.cypress_prep.outputs.matrix)}} + services: postgres: image: postgres:12 @@ -41,20 +59,20 @@ jobs: --health-retries 5 steps: - name: Checkout - uses: actions/checkout@v1 + uses: actions/checkout@v2 - name: Set up Python 3.8 uses: actions/setup-python@v2 with: python-version: 3.8 - - uses: actions/cache@v1 - name: Cache pip dependencies - id: pip-cache + + - uses: syphar/restore-virtualenv@v1.2 + id: cache-virtualenv with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} - restore-keys: | - ${{ runner.os }}-pip- + requirement_files: requirements.txt # this is optional + - uses: syphar/restore-pip-download-cache@v1 + if: steps.cache-virtualenv.outputs.cache-hit != 'true' - name: Install python dependencies + if: steps.cache-virtualenv.outputs.cache-hit != 'true' run: | python -m pip install --upgrade pip python -m pip install $(grep -ivE "psycopg2" requirements.txt | cut -d'#' -f1) --no-cache-dir --compile @@ -62,33 +80,25 @@ jobs: - uses: actions/setup-node@v1 with: node-version: 14 - - name: Get yarn cache directory path - id: yarn-dep-cache-dir-path - run: echo "::set-output name=dir::$(yarn cache dir)" - - uses: actions/cache@v1 - name: Setup Yarn dep cache - id: yarn-dep-cache + - uses: actions/cache@v2 + id: cypress-node-modules-cache-2 with: - path: ${{ steps.yarn-dep-cache-dir-path.outputs.dir }} - key: ${{ runner.os }}-yarn-dep-${{ hashFiles('**/yarn.lock') }} + path: | + **/node_modules + ~/.cache/Cypress + key: ${{ runner.os }}-cypress-node-modules-2-${{ hashFiles('**/yarn.lock') }} restore-keys: | - ${{ runner.os }}-yarn-dep- + ${{ runner.os }}-cypress-node-modules-2 - name: Yarn install deps + if: steps.cypress-node-modules-cache-2.outputs.cache-hit != 'true' run: | yarn install --frozen-lockfile - yarn add cypress@5.3.0 cypress-terminal-report@2.1.0 - - uses: actions/cache@v1 - name: Setup Yarn build cache - id: yarn-build-cache - with: - path: frontend/dist - key: ${{ runner.os }}-yarn-build-${{ hashFiles('frontend/src/') }} - restore-keys: | - ${{ runner.os }}-yarn-build- + yarn add cypress@6.7.0 cypress-terminal-report@2.1.0 @cypress/react@4.16.4 @cypress/webpack-preprocessor@5.7.0 - name: Yarn build + env: + GENERATE_SOURCEMAP: 'false' run: | yarn build - if: steps.yarn-build-cache.outputs.cache-hit != 'true' - name: Boot PostHog env: SECRET_KEY: '6b01eee4f945ca25045b5aab440b953461faf08693a9abbf1166dc7c6b9772da' # unsafe - for testing only @@ -97,20 +107,29 @@ jobs: DISABLE_SECURE_SSL_REDIRECT: 1 SECURE_COOKIES: 0 OPT_OUT_CAPTURE: 1 - SELF_CAPTURE: 1 + SELF_CAPTURE: 0 + E2E_TESTING: 1 + EMAIL_HOST: 'email.test.posthog.net' # used to test password resets + SITE_URL: 'test.posthog.net' # used to test password resets run: | - python manage.py collectstatic --noinput + python manage.py collectstatic --noinput & + ./bin/docker-migrate & wait + python manage.py setup_dev mkdir -p cypress/screenshots - ./bin/docker-migrate ./bin/docker-worker & ./bin/docker-server & - name: Cypress run uses: cypress-io/github-action@v2 with: - config-file: cypress.json + config-file: cypress.e2e.json record: true - parallel: true + parallel: ${{github.event.pull_request.head.repo.full_name == github.repository}} group: 'PostHog Frontend' + # We're already installing cypress above + # We have to install it separately otherwise the tests fail. 
+ install: false + # We already install cypress separately, we don't need to install it again here + install-command: echo "no" env: # pass the Dashboard record key as an environment variable CYPRESS_RECORD_KEY: ${{ secrets.CYPRESS_RECORD_KEY }} diff --git a/.github/workflows/e2e_matrix.json b/.github/workflows/e2e_matrix.json new file mode 100644 index 0000000000000..e02d8d4b3c407 --- /dev/null +++ b/.github/workflows/e2e_matrix.json @@ -0,0 +1,14 @@ +[ + { + "container": 1 + }, + { + "container": 2 + }, + { + "container": 3 + }, + { + "container": 4 + } +] diff --git a/.github/workflows/foss-release-image-publish.yml b/.github/workflows/foss-release-image-publish.yml new file mode 100644 index 0000000000000..026b0ea60b653 --- /dev/null +++ b/.github/workflows/foss-release-image-publish.yml @@ -0,0 +1,57 @@ +# Generates the `posthog/posthog:foss-latest-release` & `posthog/posthog:foss-release-[version]` Docker images +# and pushes to Docker Hub + +name: Docker FOSS release image + +on: + push: + tags: + - '*.**' + +jobs: + build-push: + name: Build & push Docker release image + runs-on: ubuntu-20.04 + steps: + - name: Checkout default branch + if: github.repository == 'PostHog/posthog-foss' + uses: actions/checkout@v2 + + - name: Get tag name + if: github.repository == 'PostHog/posthog-foss' + run: echo "TAG_NAME=$(echo ${GITHUB_REF#refs/tags/} | tr / -)" >> $GITHUB_ENV + + - name: Update git SHA + if: github.repository == 'PostHog/posthog-foss' + run: echo "GIT_SHA = '${GITHUB_SHA}'" > posthog/gitsha.py + + - name: Set up QEMU + if: github.repository == 'PostHog/posthog-foss' + uses: docker/setup-qemu-action@v1 + + - name: Set up Docker Buildx + if: github.repository == 'PostHog/posthog-foss' + uses: docker/setup-buildx-action@v1 + + - name: Login to DockerHub + if: github.repository == 'PostHog/posthog-foss' + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build and push release + if: github.repository == 'PostHog/posthog-foss' + id: docker-release + uses: docker/build-push-action@v2 + with: + context: . 
+ push: true + tags: | + posthog/posthog-foss:latest-release + posthog/posthog-foss:release-${{ env.TAG_NAME }} + + - name: Image digest + if: github.repository == 'PostHog/posthog-foss' + run: | + echo "Pushed release: ${{ steps.docker-release.outputs.digest }}" diff --git a/.github/workflows/foss-sync.yml b/.github/workflows/foss-sync.yml new file mode 100644 index 0000000000000..d4fc537b3e9cc --- /dev/null +++ b/.github/workflows/foss-sync.yml @@ -0,0 +1,51 @@ +name: Sync PostHog FOSS + +on: + push: + branches: + - master + - main + +jobs: + repo-sync: + name: Sync posthog-foss with posthog + if: github.repository == 'PostHog/posthog' + runs-on: ubuntu-latest + steps: + - name: Sync repositories 1 to 1 - master branch + uses: wei/git-sync@v3 + with: + source_repo: 'https://posthog-bot:${{ secrets.POSTHOG_BOT_GITHUB_TOKEN }}@github.com/posthog/posthog.git' + source_branch: 'master' + destination_repo: 'https://posthog-bot:${{ secrets.POSTHOG_BOT_GITHUB_TOKEN }}@github.com/posthog/posthog-foss.git' + destination_branch: 'master' + - name: Sync repositories 1 to 1 – tags + uses: wei/git-sync@v3 + with: + source_repo: 'https://posthog-bot:${{ secrets.POSTHOG_BOT_GITHUB_TOKEN }}@github.com/posthog/posthog.git' + source_branch: 'refs/tags/*' + destination_repo: 'https://posthog-bot:${{ secrets.POSTHOG_BOT_GITHUB_TOKEN }}@github.com/posthog/posthog-foss.git' + destination_branch: 'refs/tags/*' + - name: Checkout posthog-foss + uses: actions/checkout@v2 + with: + repository: 'posthog/posthog-foss' + ref: master + token: ${{ secrets.POSTHOG_BOT_GITHUB_TOKEN }} + - name: Change LICENSE to pure MIT + run: | + sed -i -e '/PostHog Inc\./,/Permission is hereby granted/c\Copyright (c) 2020-2021 PostHog Inc\.\n\nPermission is hereby granted, free of charge, to any person obtaining a copy' LICENSE + echo -e "MIT License\n\n$(cat LICENSE)" > LICENSE + - name: Remove unused GitHub workflows + run: | + cd .github/workflows + ls | grep -v foss-release-image-publish.yml | xargs rm + + - name: Commit "Sync and remove all non-FOSS parts" + uses: EndBug/add-and-commit@v7 + with: + message: 'Sync and remove all non-FOSS parts' + remove: '["-r ee/"]' + default_author: github_actions + github_token: ${{ secrets.POSTHOG_BOT_GITHUB_TOKEN }} + - run: echo # Empty step so that GitHub doesn't complain about an empty job on forks diff --git a/.github/workflows/new-pr.yml b/.github/workflows/new-pr.yml new file mode 100644 index 0000000000000..04570a21ec253 --- /dev/null +++ b/.github/workflows/new-pr.yml @@ -0,0 +1,34 @@ +name: New PR + +on: + pull_request: + types: [opened] + +jobs: + check-description: + name: Check that PR has description + runs-on: ubuntu-20.04 + + steps: + - name: Check if PR is shame-worthy + id: is-shame-worthy + run: | + FILTERED_BODY=$( \ + sed -r -e \ + '/^(\.\.\.)|(\*Please describe)|(\*If this affects the frontend, include screenshots)|(##? )|(- *\[)/d' \ + <<< $RAW_BODY \ + ) + echo "::debug::Filtered PR body to $FILTERED_BODY" + if [[ -z "${FILTERED_BODY//[[:space:]]/}" ]]; then + echo "::set-output name=is-shame-worthy::true" + else + echo "::set-output name=is-shame-worthy::false" + fi + env: + RAW_BODY: ${{ github.event.pull_request.body }} + + - name: Shame if PR has no description + if: steps.is-shame-worthy.outputs.is-shame-worthy == 'true' + run: | + SHAME_BODY="Hey @${{ github.actor }}! 👋\nThis pull request seems to contain no description. 
Please add useful context, rationale, and/or any other information that will help make sense of this change now and in the distant Mars-based future." + curl -s -u posthog-bot:${{ secrets.POSTHOG_BOT_GITHUB_TOKEN || secrets.GITHUB_TOKEN }} -X POST -d "{ \"body\": \"$SHAME_BODY\" }" "https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments" diff --git a/.github/workflows/plugin-server-ci.yml b/.github/workflows/plugin-server-ci.yml new file mode 100644 index 0000000000000..c23ed975e21a8 --- /dev/null +++ b/.github/workflows/plugin-server-ci.yml @@ -0,0 +1,334 @@ +name: Plugin Server CI + +on: + pull_request: + paths: + - 'plugin-server/**' + - 'ee/clickhouse/migrations/**' + - 'ee/migrations/**' + - 'posthog/migrations/**' + - 'posthog/plugins/**' + - 'docker*.yml' + +jobs: + code-quality: + name: Code quality + runs-on: ubuntu-20.04 + defaults: + run: + working-directory: 'plugin-server' + + steps: + - uses: actions/checkout@v1 + + - name: Set up Node 14 + uses: actions/setup-node@v1 + with: + node-version: 14 + + - name: Install package.json dependencies with Yarn + run: yarn + + - name: Check formatting with prettier + run: yarn prettier:check + + - name: Lint with ESLint + run: yarn lint + + tests-postgres-1: + name: Plugin Server Tests / Postgres + Redis (1) + runs-on: ubuntu-20.04 + + services: + postgres: + image: postgres:12 + env: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: test_posthog + ports: ['5432:5432'] + options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 + redis: + image: redis + ports: + - '6379:6379' + options: >- + --health-cmd "redis-cli ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + env: + REDIS_URL: 'redis://localhost' + + steps: + - name: Check out Django server for database setup + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + + - name: Install SAML (python3-saml) dependencies + run: sudo apt-get install libxml2-dev libxmlsec1-dev libxmlsec1-openssl + + - name: Set up Node 14 + uses: actions/setup-node@v2 + with: + node-version: 14 + + - uses: actions/cache@v2 + with: + path: ${{ env.pythonLocation }} + key: ${{ env.pythonLocation }}-v1-${{ hashFiles('posthog/requirements.txt') }} + + - name: Install requirements.txt dependencies with pip + run: | + pip install --upgrade pip + pip install --upgrade --upgrade-strategy eager -r requirements.txt + - name: Set up databases + env: + SECRET_KEY: 'abcdef' # unsafe - for testing only + DATABASE_URL: 'postgres://postgres:postgres@localhost:5432/posthog' + TEST: 'true' + run: python manage.py setup_test_environment + + - name: Install package.json dependencies with Yarn + run: cd plugin-server && yarn + + - name: Test with Jest + env: + # Below DB name has `test_` prepended, as that's how Django (ran above) creates the test DB + DATABASE_URL: 'postgres://postgres:postgres@localhost:5432/test_posthog' + REDIS_URL: 'redis://localhost' + run: cd plugin-server && yarn typescript:check && yarn test:postgres:1 + + tests-postgres-2: + name: Plugin Server Tests / Postgres + Redis (2) + runs-on: ubuntu-20.04 + + services: + postgres: + image: postgres:12 + env: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: test_posthog + ports: ['5432:5432'] + options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 + redis: + image: redis + ports: + - '6379:6379' + options: 
>- + --health-cmd "redis-cli ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + env: + REDIS_URL: 'redis://localhost' + + steps: + - name: Check out Django server for database setup + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + + - name: Install SAML (python3-saml) dependencies + run: sudo apt-get install libxml2-dev libxmlsec1-dev libxmlsec1-openssl + + - name: Set up Node 14 + uses: actions/setup-node@v2 + with: + node-version: 14 + + - uses: actions/cache@v2 + with: + path: ${{ env.pythonLocation }} + key: ${{ env.pythonLocation }}-v1-${{ hashFiles('posthog/requirements.txt') }} + + - name: Install requirements.txt dependencies with pip + run: | + pip install --upgrade pip + pip install --upgrade --upgrade-strategy eager -r requirements.txt + - name: Set up databases + env: + SECRET_KEY: 'abcdef' # unsafe - for testing only + DATABASE_URL: 'postgres://postgres:postgres@localhost:5432/posthog' + TEST: 'true' + run: python manage.py setup_test_environment + + - name: Install package.json dependencies with Yarn + run: cd plugin-server && yarn + + - name: Test with Jest + env: + # Below DB name has `test_` prepended, as that's how Django (ran above) creates the test DB + DATABASE_URL: 'postgres://postgres:postgres@localhost:5432/test_posthog' + REDIS_URL: 'redis://localhost' + run: cd plugin-server && yarn typescript:check && yarn test:postgres:2 + + tests-clickhouse-1: + name: Plugin Server Tests / ClickHouse + Kafka (1) + runs-on: ubuntu-20.04 + + services: + postgres: + image: postgres:12 + env: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: test_posthog + ports: ['5432:5432'] + options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 + redis: + image: redis + ports: + - '6379:6379' + options: >- + --health-cmd "redis-cli ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + env: + REDIS_URL: 'redis://localhost' + CLICKHOUSE_HOST: 'localhost' + CLICKHOUSE_DATABASE: 'posthog_test' + KAFKA_ENABLED: 'true' + KAFKA_HOSTS: 'kafka:9092' + + steps: + - name: Check out Django server for database setup + uses: actions/checkout@v2 + + - name: Fix Kafka Hostname + run: | + sudo bash -c 'echo "127.0.0.1 kafka zookeeper" >> /etc/hosts' + ping -c 1 kafka + ping -c 1 zookeeper + - name: Start Kafka, ClickHouse, Zookeeper + run: docker-compose -f ee/docker-compose.ch.yml up -d zookeeper kafka clickhouse + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + + - name: Install SAML (python3-saml) dependencies + run: sudo apt-get install libxml2-dev libxmlsec1-dev libxmlsec1-openssl + + - name: Set up Node 14 + uses: actions/setup-node@v2 + with: + node-version: 14 + + - uses: actions/cache@v2 + with: + path: ${{ env.pythonLocation }} + key: ${{ env.pythonLocation }}-v1-${{ hashFiles('posthog/requirements.txt') }} + + - name: Install requirements.txt dependencies with pip + run: | + pip install --upgrade pip + pip install --upgrade --upgrade-strategy eager -r requirements.txt + - name: Set up databases + env: + SECRET_KEY: 'abcdef' # unsafe - for testing only + DATABASE_URL: 'postgres://postgres:postgres@localhost:5432/posthog' + PRIMARY_DB: 'clickhouse' + TEST: 'true' + run: python manage.py setup_test_environment + + - name: Install package.json dependencies with Yarn + run: cd plugin-server && yarn + + - name: Test with Jest + env: + # Below DB name has `test_` prepended, as that's how Django 
(ran above) creates the test DB + DATABASE_URL: 'postgres://postgres:postgres@localhost:5432/test_posthog' + REDIS_URL: 'redis://localhost' + run: cd plugin-server && yarn typescript:check && yarn test:clickhouse:1 + + tests-clickhouse-2: + name: Plugin Server Tests / ClickHouse + Kafka (2) + runs-on: ubuntu-20.04 + + services: + postgres: + image: postgres:12 + env: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: test_posthog + ports: ['5432:5432'] + options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 + redis: + image: redis + ports: + - '6379:6379' + options: >- + --health-cmd "redis-cli ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + env: + REDIS_URL: 'redis://localhost' + CLICKHOUSE_HOST: 'localhost' + CLICKHOUSE_DATABASE: 'posthog_test' + KAFKA_ENABLED: 'true' + KAFKA_HOSTS: 'kafka:9092' + + steps: + - name: Check out Django server for database setup + uses: actions/checkout@v2 + + - name: Fix Kafka Hostname + run: | + sudo bash -c 'echo "127.0.0.1 kafka zookeeper" >> /etc/hosts' + ping -c 1 kafka + ping -c 1 zookeeper + - name: Start Kafka, ClickHouse, Zookeeper + run: docker-compose -f ee/docker-compose.ch.yml up -d zookeeper kafka clickhouse + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + + - name: Install SAML (python3-saml) dependencies + run: sudo apt-get install libxml2-dev libxmlsec1-dev libxmlsec1-openssl + + - name: Set up Node 14 + uses: actions/setup-node@v2 + with: + node-version: 14 + + - uses: actions/cache@v2 + with: + path: ${{ env.pythonLocation }} + key: ${{ env.pythonLocation }}-v1-${{ hashFiles('posthog/requirements.txt') }} + + - name: Install requirements.txt dependencies with pip + run: | + pip install --upgrade pip + pip install --upgrade --upgrade-strategy eager -r requirements.txt + - name: Set up databases + env: + SECRET_KEY: 'abcdef' # unsafe - for testing only + DATABASE_URL: 'postgres://postgres:postgres@localhost:5432/posthog' + PRIMARY_DB: 'clickhouse' + TEST: 'true' + run: python manage.py setup_test_environment + + - name: Install package.json dependencies with Yarn + run: cd plugin-server && yarn + + - name: Test with Jest + env: + # Below DB name has `test_` prepended, as that's how Django (ran above) creates the test DB + DATABASE_URL: 'postgres://postgres:postgres@localhost:5432/test_posthog' + REDIS_URL: 'redis://localhost' + run: cd plugin-server && yarn typescript:check && yarn test:clickhouse:2 diff --git a/.github/workflows/posthog-foss.yml b/.github/workflows/posthog-foss.yml deleted file mode 100644 index aef823077fad6..0000000000000 --- a/.github/workflows/posthog-foss.yml +++ /dev/null @@ -1,34 +0,0 @@ -name: PostHog FOSS - -on: - push: - branches: - - master - -jobs: - repo-sync: - name: Sync posthog-foss with posthog - runs-on: ubuntu-latest - steps: - - name: Sync repositories 1 to 1 - uses: ungless/git-sync@master # tag syncing is not currently part of wei/git-sync - with: - source_repo: 'https://${{ secrets.SYNC_GITHUB_TOKEN }}@github.com/posthog/posthog.git' - source_branch: 'master' - destination_repo: 'https://${{ secrets.SYNC_GITHUB_TOKEN }}@github.com/posthog/posthog-foss.git' - destination_branch: 'master' - - name: Checkout posthog-foss - uses: actions/checkout@v2 - with: - repository: 'posthog/posthog-foss' - ref: master - token: ${{ secrets.SYNC_GITHUB_TOKEN }} # SYNC_GITHUB_TOKEN is a PAT token with the workflows scope which is not in GITHUB_TOKEN - - name: Commit "Sync and remove all 
non-FOSS parts" - uses: EndBug/add-and-commit@v4 - with: - author_name: PostHog Bot - author_email: hey@posthog.com - message: 'Sync and remove all non-FOSS parts' - remove: '-r ee/' - env: - GITHUB_TOKEN: ${{ secrets.SYNC_GITHUB_TOKEN }} diff --git a/.github/workflows/prod-container.yml b/.github/workflows/prod-container.yml index e107901259f68..e1ffc93ff9262 100644 --- a/.github/workflows/prod-container.yml +++ b/.github/workflows/prod-container.yml @@ -2,11 +2,14 @@ name: Build & Deploy Production Containers on: push: - branches: master + branches: + - master + - main jobs: build: name: Build & Deploy Production Docker image + if: github.repository == 'PostHog/posthog' runs-on: ubuntu-20.04 steps: - name: Configure AWS credentials @@ -20,15 +23,17 @@ jobs: id: login-ecr uses: aws-actions/amazon-ecr-login@v1 - - name: Fetch posthog-production + - name: Fetch posthog-cloud run: | - curl -L https://github.com/posthog/posthog-production/tarball/master | tar --strip-components=1 -xz -- + curl -L https://github.com/posthog/posthog-cloud/tarball/master | tar --strip-components=1 -xz -- mkdir deploy/ - name: Checkout master uses: actions/checkout@v2 with: - ref: 'master' + # ref defaults to whatever branch the action is operating on. Leaving it blank on master or main will make it + # easier to switch if we decide to. Also makes branch deploys easier. + # ref: 'master' path: 'deploy/' - name: Build, tag, and push image to Amazon ECR @@ -43,17 +48,91 @@ jobs: echo "::set-output name=image::$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" - name: Fill in the new image ID in the Amazon ECS task definition - id: task-def + id: task-def-web uses: aws-actions/amazon-ecs-render-task-definition@v1 with: - task-definition: deploy/task-definition.json + task-definition: deploy/task-definition.web.json container-name: posthog-production image: ${{ steps.build-image.outputs.image }} - - name: Deploy Amazon ECS task definition + - name: Fill in the new worker image ID in the Amazon ECS task definition + id: task-def-worker + uses: aws-actions/amazon-ecs-render-task-definition@v1 + with: + task-definition: deploy/task-definition.worker.json + container-name: posthog-production-worker + image: ${{ steps.build-image.outputs.image }} + + - name: Fill in the new plugins image ID in the Amazon ECS task definition + id: task-def-plugins + uses: aws-actions/amazon-ecs-render-task-definition@v1 + with: + task-definition: deploy/task-definition.plugins.json + container-name: posthog-production-plugins + image: ${{ steps.build-image.outputs.image }} + + - name: Fill in the new migration image ID in the Amazon ECS task definition + id: task-def-migrate + uses: aws-actions/amazon-ecs-render-task-definition@v1 + with: + task-definition: deploy/task-definition.migration.json + container-name: posthog-production-migration + image: ${{ steps.build-image.outputs.image }} + + - name: Perform migrations + run: | + aws ecs register-task-definition --cli-input-json file://$TASK_DEFINITION + aws ecs run-task --cluster posthog-production-cluster --count 1 --launch-type FARGATE --task-definition posthog-production-migration --network-configuration '{ + "awsvpcConfiguration": { + "subnets": ["subnet-8738fde1"], + "securityGroups": ["sg-05a5f7e510b15473c"], + "assignPublicIp": "ENABLED" + }}' + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_DEFAULT_REGION: 'us-east-1' + TASK_DEFINITION: ${{ steps.task-def-migrate.outputs.task-definition }} + + - name: Deploy 
Amazon ECS web task definition uses: aws-actions/amazon-ecs-deploy-task-definition@v1 with: - task-definition: ${{ steps.task-def.outputs.task-definition }} + task-definition: ${{ steps.task-def-web.outputs.task-definition }} service: posthog-production cluster: posthog-production-cluster - wait-for-service-stability: true + + - name: Deploy Amazon ECS events task definition + uses: aws-actions/amazon-ecs-deploy-task-definition@v1 + with: + task-definition: ${{ steps.task-def-web.outputs.task-definition }} + service: posthog-production-event + cluster: posthog-production-cluster + + - name: Deploy Amazon ECS worker task definition + uses: aws-actions/amazon-ecs-deploy-task-definition@v1 + with: + task-definition: ${{ steps.task-def-worker.outputs.task-definition }} + service: posthog-production-worker + cluster: posthog-production-cluster + + - name: Deploy Amazon ECS plugins task definition + uses: aws-actions/amazon-ecs-deploy-task-definition@v1 + with: + task-definition: ${{ steps.task-def-plugins.outputs.task-definition }} + service: posthog-production-plugins + cluster: posthog-production-cluster + sentry: + name: Notify Sentry of a production release + runs-on: ubuntu-20.04 + if: github.repository == 'PostHog/posthog' + steps: + - name: Checkout master + uses: actions/checkout@v2 + - name: Notify Sentry + uses: getsentry/action-release@v1 + env: + SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }} + SENTRY_ORG: posthog + SENTRY_PROJECT: posthog + with: + environment: production diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml new file mode 100644 index 0000000000000..6000c56a553e5 --- /dev/null +++ b/.github/workflows/stale.yaml @@ -0,0 +1,20 @@ +name: 'Handle stale PRs' +on: + schedule: + - cron: '30 7 * * 1-5' + +jobs: + stale: + runs-on: ubuntu-latest + steps: + - uses: actions/stale@v4 + with: + days-before-issue-stale: 9999 + stale-pr-message: "This PR hasn't seen activity in a week! Should it be merged, closed, or further worked on? If you want to keep it open, post a comment or remove the `stale` label – otherwise this will be closed in another week." + close-pr-message: 'This PR was closed due to 2 weeks of inactivity. Feel free to reopen it if still relevant.' + days-before-pr-stale: 7 + days-before-pr-close: 7 + stale-issue-label: stale + stale-pr-label: stale + operations-per-run: 30 + repo-token: ${{ secrets.POSTHOG_BOT_GITHUB_TOKEN }} diff --git a/.github/workflows/storybook-chromatic.yml b/.github/workflows/storybook-chromatic.yml new file mode 100644 index 0000000000000..cd807fba3a16b --- /dev/null +++ b/.github/workflows/storybook-chromatic.yml @@ -0,0 +1,22 @@ +name: 'Storybook Chromatic' + +on: pull_request + +jobs: + storybook-chromatic: + runs-on: ubuntu-latest + if: github.event.pull_request.head.repo.full_name == github.repository + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 # 👈 Required to retrieve git history (https://www.chromatic.com/docs/github-actions) + + - name: Install dependencies and chromatic + run: yarn add --dev chromatic + + - name: Publish to Chromatic + uses: chromaui/action@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + # 👇 Chromatic projectToken, refer to the manage page to obtain it. 
+ projectToken: ${{ secrets.CHROMATIC_PROJECT_TOKEN }} diff --git a/.github/workflows/storybook-deploy.yml b/.github/workflows/storybook-deploy.yml new file mode 100644 index 0000000000000..a7706dd97400e --- /dev/null +++ b/.github/workflows/storybook-deploy.yml @@ -0,0 +1,47 @@ +name: 'Storybook Deployment' + +on: + push: + branches: + - master + - main + +jobs: + storybook-deployment: + runs-on: ubuntu-latest + if: github.repository == 'PostHog/posthog' + steps: + - name: Check out PostHog/posthog repo + uses: actions/checkout@v2 + with: + path: posthog + fetch-depth: 0 + + - name: Install dependencies (yarn) + run: cd posthog && yarn + + - name: Build storybook + run: cd posthog && yarn build-storybook + + - name: Check out PostHog/storybook-build repo + uses: actions/checkout@v2 + with: + path: storybook-build + repository: PostHog/storybook-build + token: ${{ secrets.POSTHOG_BOT_GITHUB_TOKEN }} + + - name: Copy built changes to PostHog/storybook-build repo + run: | + # keep the CNAME file, but discard all the rest + cp storybook-build/docs/CNAME posthog/storybook-static/ + rm -rf storybook-build/docs + cp -a posthog/storybook-static storybook-build/docs + + - name: Commit update + uses: stefanzweifel/git-auto-commit-action@v4 + with: + repository: storybook-build + commit_message: 'Storybook build' + commit_user_name: PostHog Bot + commit_user_email: hey@posthog.com + commit_author: PostHog Bot diff --git a/.github/workflows/version-update.yml b/.github/workflows/version-update.yml deleted file mode 100644 index a5a961f8a8678..0000000000000 --- a/.github/workflows/version-update.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: Version Update - -on: - push: - tags: - - '*.**' - -jobs: - update-version: - name: Update VERSION - runs-on: ubuntu-latest - steps: - - name: Checkout repo - uses: actions/checkout@v2 - with: - ref: master - fetch-depth: 0 - - - name: Edit version.py - run: echo "VERSION = \"$(git describe --tags `git rev-list --tags --max-count=1`)\"" > posthog/version.py - - - name: Commit update - uses: stefanzweifel/git-auto-commit-action@v4 - with: - commit_message: 'Update VERSION' - commit_user_name: PostHog Bot - commit_user_email: hey@posthog.com - commit_author: PostHog Bot diff --git a/.gitignore b/.gitignore index fd52979ea88d6..053a6a8554991 100644 --- a/.gitignore +++ b/.gitignore @@ -10,22 +10,29 @@ settings.yml debug.log *.swp *.swo -impliu/static/CACHE/* -node_modules/* +node_modules/ *.code-workspace node_modules -.vscode/* +.vscode/ frontend/.cache/ .mypy_cache -frontend/dist/* -frontend/types/* +frontend/dist/ +frontend/types/ +*Type.ts frontend/yarn-error.log +frontend/tmp .idea celerybeat-schedule celerybeat.pid yarn-error.log .yalc yalc.lock -cypress/screenshots/* +cypress/screenshots/ docker-compose.prod.yml -.python-version \ No newline at end of file +.python-version +*.isorted +build-storybook.log +storybook-static +ee/benchmarks/results +plugin-server/src/config/idl/protos.* + diff --git a/.kearc b/.kearc index 2e92bf8f5e8cc..7fb5652f271a3 100644 --- a/.kearc +++ b/.kearc @@ -1,6 +1,7 @@ { "tsConfigPath": "./tsconfig.json", "rootPath": "./frontend/src", - "typesPath": "./frontend/types" + "typesPath": "./frontend/src", + "writePaths": true } diff --git a/.platform/applications.yaml b/.platform/applications.yaml index eb315561a3d70..5ed4ac0f9cf15 100644 --- a/.platform/applications.yaml +++ b/.platform/applications.yaml @@ -3,8 +3,8 @@ build: flavor: none relationships: - postgresdatabase: "dbpostgres:postgresql" - cache: "redis:redis" + postgresdatabase: 
'dbpostgres:postgresql' + cache: 'redis:redis' hooks: build: | # Install NVM and a recet version of node @@ -25,13 +25,14 @@ deploy: python manage.py migrate web: commands: - start: "gunicorn posthog.wsgi --log-file -" + start: 'gunicorn posthog.wsgi --log-file -' variables: env: DATABASE_URL: postgres://main:main@postgresdatabase.internal:5432/main REDIS_URL: redis://cache.internal:6379 NODE_OPTIONS: --max_old_space_size=1536 + DEPLOYMENT: Platform.sh workers: - worker: - commands: - start: celery -A posthog worker --beat --scheduler redbeat.RedBeatScheduler --loglevel=info --pidfile="/tmp/celerybeat.pid" --concurrency=2 \ No newline at end of file + worker: + commands: + start: celery -A posthog worker --beat --scheduler redbeat.RedBeatScheduler --loglevel=info --pidfile="/tmp/celerybeat.pid" --concurrency=2 --without-heartbeat --without-gossip --without-mingle diff --git a/.platform/routes.yaml b/.platform/routes.yaml index 1c6790fd2b205..8ee8cd0ee2693 100644 --- a/.platform/routes.yaml +++ b/.platform/routes.yaml @@ -3,10 +3,10 @@ # Each route describes how an incoming URL is going # to be processed by Platform.sh. -"https://{default}/": +'https://{default}/': type: upstream - upstream: "app:http" + upstream: 'app:http' -"https://www.{default}/": +'https://www.{default}/': type: redirect - to: "https://{default}/" \ No newline at end of file + to: 'https://{default}/' diff --git a/.platform/services.yaml b/.platform/services.yaml index e68ed7a0442a3..8e823a528d09c 100644 --- a/.platform/services.yaml +++ b/.platform/services.yaml @@ -3,4 +3,4 @@ dbpostgres: disk: 2048 redis: type: redis-persistent:5.0 - disk: 512 \ No newline at end of file + disk: 512 diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 0000000000000..8e91779abcd99 --- /dev/null +++ b/.prettierignore @@ -0,0 +1,17 @@ +venv +env +.venv +__pycache__/ +staticfiles +.env +*.code-workspace +frontend/.cache/ +.mypy_cache +frontend/dist/ +*Type.ts +.idea +.yalc +.python-version +storybook-static +dist/ +node_modules/ \ No newline at end of file diff --git a/.run/Celery - ClickHouse.run.xml b/.run/Celery - ClickHouse.run.xml new file mode 100644 index 0000000000000..ae109c33ac8fe --- /dev/null +++ b/.run/Celery - ClickHouse.run.xml @@ -0,0 +1,29 @@ + + + + + \ No newline at end of file diff --git a/.run/Celery - Postgres.run.xml b/.run/Celery - Postgres.run.xml new file mode 100644 index 0000000000000..dc53a456b6c7c --- /dev/null +++ b/.run/Celery - Postgres.run.xml @@ -0,0 +1,25 @@ + + + + + \ No newline at end of file diff --git a/.run/Frontend.run.xml b/.run/Frontend.run.xml new file mode 100644 index 0000000000000..8d919432e1c03 --- /dev/null +++ b/.run/Frontend.run.xml @@ -0,0 +1,12 @@ + + + + + + +INC +$im->include($inc); +$im->filledRectangle(0, 0, $imagewidth, $imageheight, 'url(#background)'); +$im->stringTTF("title", int($imagewidth / 2), $fontsize * 2, $titletext); +$im->stringTTF("subtitle", int($imagewidth / 2), $fontsize * 4, $subtitletext) if $subtitletext ne ""; +$im->stringTTF("details", $xpad, $imageheight - ($ypad2 / 2), " "); +$im->stringTTF("unzoom", $xpad, $fontsize * 2, "Reset Zoom", 'class="hide"'); +$im->stringTTF("search", $imagewidth - $xpad - 100, $fontsize * 2, "Search"); +$im->stringTTF("ignorecase", $imagewidth - $xpad - 16, $fontsize * 2, "ic"); +$im->stringTTF("matched", $imagewidth - $xpad - 100, $imageheight - ($ypad2 / 2), " "); + +if ($palette) { + read_palette(); +} + +# draw frames +$im->group_start({id => "frames"}); +while (my ($id, $node) = each %Node) { + my 
($func, $depth, $etime) = split ";", $id; + my $stime = $node->{stime}; + my $delta = $node->{delta}; + + $etime = $timemax if $func eq "" and $depth == 0; + + my $x1 = $xpad + $stime * $widthpertime; + my $x2 = $xpad + $etime * $widthpertime; + my ($y1, $y2); + unless ($inverted) { + $y1 = $imageheight - $ypad2 - ($depth + 1) * $frameheight + $framepad; + $y2 = $imageheight - $ypad2 - $depth * $frameheight; + } else { + $y1 = $ypad1 + $depth * $frameheight; + $y2 = $ypad1 + ($depth + 1) * $frameheight - $framepad; + } + + my $samples = sprintf "%.0f", ($etime - $stime) * $factor; + (my $samples_txt = $samples) # add commas per perlfaq5 + =~ s/(^[-+]?\d+?(?=(?>(?:\d{3})+)(?!\d))|\G\d{3}(?=\d))/$1,/g; + + my $info; + if ($func eq "" and $depth == 0) { + $info = "all ($samples_txt $countname, 100%)"; + } else { + my $pct = sprintf "%.2f", ((100 * $samples) / ($timemax * $factor)); + my $escaped_func = $func; + # clean up SVG breaking characters: + $escaped_func =~ s/&/&/g; + $escaped_func =~ s//>/g; + $escaped_func =~ s/"/"/g; + $escaped_func =~ s/_\[[kwij]\]$//; # strip any annotation + unless (defined $delta) { + $info = "$escaped_func ($samples_txt $countname, $pct%)"; + } else { + my $d = $negate ? -$delta : $delta; + my $deltapct = sprintf "%.2f", ((100 * $d) / ($timemax * $factor)); + $deltapct = $d > 0 ? "+$deltapct" : $deltapct; + $info = "$escaped_func ($samples_txt $countname, $pct%; $deltapct%)"; + } + } + + my $nameattr = { %{ $nameattr{$func}||{} } }; # shallow clone + $nameattr->{title} ||= $info; + $im->group_start($nameattr); + + my $color; + if ($func eq "--") { + $color = $vdgrey; + } elsif ($func eq "-") { + $color = $dgrey; + } elsif (defined $delta) { + $color = color_scale($delta, $maxdelta); + } elsif ($palette) { + $color = color_map($colors, $func); + } else { + $color = color($colors, $hash, $func); + } + $im->filledRectangle($x1, $y1, $x2, $y2, $color, 'rx="2" ry="2"'); + + my $chars = int( ($x2 - $x1) / ($fontsize * $fontwidth)); + my $text = ""; + if ($chars >= 3) { # room for one char plus two dots + $func =~ s/_\[[kwij]\]$//; # strip any annotation + $text = substr $func, 0, $chars; + substr($text, -2, 2) = ".." 
if $chars < length $func; + $text =~ s/&/&/g; + $text =~ s//>/g; + } + $im->stringTTF(undef, $x1 + 3, 3 + ($y1 + $y2) / 2, $text); + + $im->group_end($nameattr); +} +$im->group_end(); + +print $im->svg; + +if ($palette) { + write_palette(); +} + +# vim: ts=8 sts=8 sw=8 noexpandtab diff --git a/ee/clickhouse/clickhouse_test_runner.py b/ee/clickhouse/clickhouse_test_runner.py deleted file mode 100644 index 486e89de7841e..0000000000000 --- a/ee/clickhouse/clickhouse_test_runner.py +++ /dev/null @@ -1,41 +0,0 @@ -from django.test.runner import DiscoverRunner -from infi.clickhouse_orm import Database - -from ee.clickhouse.client import sync_execute -from posthog.settings import ( - CLICKHOUSE_DATABASE, - CLICKHOUSE_HTTP_URL, - CLICKHOUSE_PASSWORD, - CLICKHOUSE_USERNAME, - CLICKHOUSE_VERIFY, -) - - -class ClickhouseTestRunner(DiscoverRunner): - def get_database(self) -> Database: - return Database( - CLICKHOUSE_DATABASE, - db_url=CLICKHOUSE_HTTP_URL, - username=CLICKHOUSE_USERNAME, - password=CLICKHOUSE_PASSWORD, - verify_ssl_cert=CLICKHOUSE_VERIFY, - ) - - def setup_databases(self, **kwargs): - database = self.get_database() - try: - database.drop_database() - except: - pass - database.create_database() - database.migrate("ee.clickhouse.migrations") - # Make DELETE / UPDATE synchronous to avoid flaky tests - sync_execute("SET mutations_sync = 1") - return super().setup_databases(**kwargs) - - def teardown_databases(self, old_config, **kwargs): - try: - self.get_database().drop_database() - except: - pass - super().teardown_databases(old_config, **kwargs) diff --git a/ee/clickhouse/client.py b/ee/clickhouse/client.py index 6e775c831b8a3..23b0d274d3a8b 100644 --- a/ee/clickhouse/client.py +++ b/ee/clickhouse/client.py @@ -1,45 +1,86 @@ import asyncio import hashlib import json -from time import time -from typing import Any, List, Tuple +import types +from time import perf_counter +from typing import Any, Dict, List, Optional, Tuple, Union import sqlparse from aioch import Client from asgiref.sync import async_to_sync from clickhouse_driver import Client as SyncClient +from clickhouse_driver.util.escape import escape_params from clickhouse_pool import ChPool -from django.conf import settings +from django.conf import settings as app_settings +from django.core.cache import cache +from django.utils.timezone import now +from sentry_sdk.api import capture_exception +from ee.clickhouse.errors import wrap_query_error +from ee.clickhouse.timer import get_timer_thread from posthog import redis +from posthog.constants import AnalyticsDBMS +from posthog.internal_metrics import incr, timing from posthog.settings import ( - CLICKHOUSE, CLICKHOUSE_ASYNC, CLICKHOUSE_CA, + CLICKHOUSE_CONN_POOL_MAX, + CLICKHOUSE_CONN_POOL_MIN, CLICKHOUSE_DATABASE, CLICKHOUSE_HOST, CLICKHOUSE_PASSWORD, CLICKHOUSE_SECURE, + CLICKHOUSE_USER, CLICKHOUSE_VERIFY, PRIMARY_DB, TEST, ) +from posthog.utils import get_safe_cache + +InsertParams = Union[list, tuple, types.GeneratorType] +NonInsertParams = Union[Dict[str, Any]] +QueryArgs = Optional[Union[InsertParams, NonInsertParams]] CACHE_TTL = 60 # seconds +SLOW_QUERY_THRESHOLD_MS = 15000 +QUERY_TIMEOUT_THREAD = get_timer_thread("ee.clickhouse.client", SLOW_QUERY_THRESHOLD_MS) + +_request_information: Optional[Dict] = None + +def make_ch_pool(**overrides) -> ChPool: + kwargs = { + "host": CLICKHOUSE_HOST, + "database": CLICKHOUSE_DATABASE, + "secure": CLICKHOUSE_SECURE, + "user": CLICKHOUSE_USER, + "password": CLICKHOUSE_PASSWORD, + "ca_certs": CLICKHOUSE_CA, + "verify": 
CLICKHOUSE_VERIFY, + "connections_min": CLICKHOUSE_CONN_POOL_MIN, + "connections_max": CLICKHOUSE_CONN_POOL_MAX, + "settings": {"mutations_sync": "1"} if TEST else {}, + **overrides, + } -if PRIMARY_DB != CLICKHOUSE: + return ChPool(**kwargs) + + +if PRIMARY_DB != AnalyticsDBMS.CLICKHOUSE: ch_client = None # type: Client - ch_sync_pool = None # type: ChPool - def async_execute(query, args=None): - return + class ClickHouseNotConfigured(NotImplementedError): + def __init__(self, msg='This function only works if PRIMARY_DB is set to indicate ClickHouse!"', *args): + super().__init__(msg, *args) - def sync_execute(query, args=None): - return + def async_execute(query, args=None, settings=None, with_column_types=False): + raise ClickHouseNotConfigured() - def cache_sync_execute(query, args=None, redis_client=None, ttl=None): - return + def sync_execute(query, args=None, settings=None, with_column_types=False): + raise ClickHouseNotConfigured() + + def cache_sync_execute(query, args=None, redis_client=None, ttl=None, settings=None, with_column_types=False): + raise ClickHouseNotConfigured() else: @@ -48,15 +89,20 @@ def cache_sync_execute(query, args=None, redis_client=None, ttl=None): host=CLICKHOUSE_HOST, database=CLICKHOUSE_DATABASE, secure=CLICKHOUSE_SECURE, + user=CLICKHOUSE_USER, password=CLICKHOUSE_PASSWORD, ca_certs=CLICKHOUSE_CA, verify=CLICKHOUSE_VERIFY, ) + ch_pool = make_ch_pool() + @async_to_sync - async def async_execute(query, args=None): + async def async_execute(query, args=None, settings=None, with_column_types=False): loop = asyncio.get_event_loop() - task = loop.create_task(ch_client.execute(query, args)) + task = loop.create_task( + ch_client.execute(query, args, settings=settings, with_column_types=with_column_types) + ) return task else: @@ -65,26 +111,19 @@ async def async_execute(query, args=None): host=CLICKHOUSE_HOST, database=CLICKHOUSE_DATABASE, secure=CLICKHOUSE_SECURE, + user=CLICKHOUSE_USER, password=CLICKHOUSE_PASSWORD, ca_certs=CLICKHOUSE_CA, verify=CLICKHOUSE_VERIFY, + settings={"mutations_sync": "1"} if TEST else {}, ) - def async_execute(query, args=None): - return sync_execute(query, args) - - ch_sync_pool = ChPool( - host=CLICKHOUSE_HOST, - database=CLICKHOUSE_DATABASE, - secure=CLICKHOUSE_SECURE, - password=CLICKHOUSE_PASSWORD, - ca_certs=CLICKHOUSE_CA, - verify=CLICKHOUSE_VERIFY, - connections_min=20, - connections_max=100, - ) - - def cache_sync_execute(query, args=None, redis_client=None, ttl=CACHE_TTL): + ch_pool = make_ch_pool() + + def async_execute(query, args=None, settings=None, with_column_types=False): + return sync_execute(query, args, settings=settings, with_column_types=with_column_types) + + def cache_sync_execute(query, args=None, redis_client=None, ttl=CACHE_TTL, settings=None, with_column_types=False): if not redis_client: redis_client = redis.get_client() key = _key_hash(query, args) @@ -92,23 +131,80 @@ def cache_sync_execute(query, args=None, redis_client=None, ttl=CACHE_TTL): result = _deserialize(redis_client.get(key)) return result else: - result = sync_execute(query, args) + result = sync_execute(query, args, settings=settings, with_column_types=with_column_types) redis_client.set(key, _serialize(result), ex=ttl) return result - def sync_execute(query, args=None): - start_time = time() - try: - with ch_sync_pool.get_client() as client: - result = client.execute(query, args) - finally: - execution_time = time() - start_time - if settings.SHELL_PLUS_PRINT_SQL: - print(format_sql(query, args)) - print("Execution time: %.6fs" % 
(execution_time,)) + def sync_execute(query, args=None, settings=None, with_column_types=False): + with ch_pool.get_client() as client: + start_time = perf_counter() + + prepared_sql, prepared_args, tags = _prepare_query(client=client, query=query, args=args) + + timeout_task = QUERY_TIMEOUT_THREAD.schedule(_notify_of_slow_query_failure, tags) + + try: + result = client.execute( + prepared_sql, params=prepared_args, settings=settings, with_column_types=with_column_types + ) + except Exception as err: + err = wrap_query_error(err) + tags["failed"] = True + tags["reason"] = type(err).__name__ + incr("clickhouse_sync_execution_failure", tags=tags) + + raise err + finally: + execution_time = perf_counter() - start_time + + QUERY_TIMEOUT_THREAD.cancel(timeout_task) + timing("clickhouse_sync_execution_time", execution_time * 1000.0, tags=tags) + + if app_settings.SHELL_PLUS_PRINT_SQL: + print("Execution time: %.6fs" % (execution_time,)) + if _request_information is not None and _request_information.get("save", False): + save_query(prepared_sql, execution_time) return result +def _prepare_query(client: SyncClient, query: str, args: QueryArgs): + """ + Given a string query with placeholders we do one of two things: + + 1. for a insert query we just format, and remove comments + 2. for non-insert queries, we return the sql with placeholders + evaluated with the contents of `args` + + We also return `tags` which contains some detail around the context + within which the query was executed e.g. the django view name + + NOTE: `client.execute` would normally handle substitution, but + because we want to strip the comments to make it easier to copy + and past queries from the `system.query_log` easily with metabase + (metabase doesn't show new lines, so with comments, you can't get + a working query without exporting to csv or similar), we need to + do it manually. + + We only want to try to substitue for SELECT queries, which + clickhouse_driver at this moment in time decides based on the + below predicate. + """ + if isinstance(args, (list, tuple, types.GeneratorType)): + rendered_sql = query + else: + rendered_sql = client.substitute_params(query, args or {}) + args = None + + formatted_sql = sqlparse.format(rendered_sql, strip_comments=True) + annotated_sql, tags = _annotate_tagged_query(formatted_sql, args) + + if app_settings.SHELL_PLUS_PRINT_SQL: + print() + print(format_sql(formatted_sql)) + + return annotated_sql, args, tags + + def _deserialize(result_bytes: bytes) -> List[Tuple]: results = [] for x in json.loads(result_bytes): @@ -125,19 +221,62 @@ def _key_hash(query: str, args: Any) -> bytes: return key -def format_sql(sql, params): - substitute_params = ( - ch_client.substitute_params if isinstance(ch_client, SyncClient) else ch_client._client.substitute_params - ) +def _annotate_tagged_query(query, args): + """ + Adds in a /* */ so we can look in clickhouses `system.query_log` + to easily marry up to the generating code. 
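As an aside on the comment-stripping and tagging flow described above: the net effect of `_prepare_query` plus `_annotate_tagged_query` is that every SELECT reaching ClickHouse becomes a single-comment statement of the form `/* kind:id */ SELECT ...`. The following is a minimal standalone sketch of that idea only; the `request_information` dict and the helper name are illustrative stand-ins, not the module-level state the patch actually uses.

```python
import sqlparse


def annotate_for_query_log(query: str, request_information: dict) -> str:
    """Illustrative sketch: strip existing comments, then prepend a
    /* kind:id */ marker so the statement can be matched up in
    system.query_log later."""
    # strip_comments=True removes whatever comments were already in the SQL,
    # leaving the marker added below as the only comment in the logged query.
    formatted = sqlparse.format(query, strip_comments=True)
    kind = request_information.get("kind", "unknown")
    # Request ids can contain "/", which is normalised to "_" here, mirroring
    # the .replace('/', '_') call in the patch above.
    ident = str(request_information.get("id", "")).replace("/", "_")
    return f"/* {kind}:{ident} */ {formatted}"


# Hypothetical usage:
#   annotate_for_query_log("SELECT 1 -- debug", {"kind": "request", "id": "insight/trends"})
# yields something like "/* request:insight_trends */ SELECT 1"
```

Stripping comments first is what makes the marker the only comment left in the logged statement, which in turn makes matching rows in `system.query_log` back to the generating request reliable.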
+ """ + tags = {"kind": (_request_information or {}).get("kind"), "id": (_request_information or {}).get("id")} + if isinstance(args, dict) and "team_id" in args: + tags["team_id"] = args["team_id"] + # Annotate the query with information on the request/task + if _request_information is not None: + query = f"/* {_request_information['kind']}:{_request_information['id'].replace('/', '_')} */ {query}" - sql = substitute_params(sql, params or {}) - sql = sqlparse.format(sql, reindent_aligned=True) - try: - import pygments.formatters - import pygments.lexers + return query, tags + + +def _notify_of_slow_query_failure(tags: Dict[str, Any]): + tags["failed"] = True + tags["reason"] = "timeout" + incr("clickhouse_sync_execution_failure", tags=tags) - sql = pygments.highlight(sql, pygments.lexers.get_lexer_by_name("sql"), pygments.formatters.TerminalFormatter()) - except: - pass - return sql +def format_sql(rendered_sql, colorize=True): + formatted_sql = sqlparse.format(rendered_sql, reindent_aligned=True) + if colorize: + try: + import pygments.formatters + import pygments.lexers + + return pygments.highlight( + formatted_sql, pygments.lexers.get_lexer_by_name("sql"), pygments.formatters.TerminalFormatter() + ) + except: + pass + + return formatted_sql + + +def save_query(sql: str, execution_time: float) -> None: + """ + Save query for debugging purposes + """ + if _request_information is None: + return + + try: + key = "save_query_{}".format(_request_information["user_id"]) + queries = json.loads(get_safe_cache(key) or "[]") + + queries.insert( + 0, + { + "timestamp": now().isoformat(), + "query": format_sql(sql, colorize=False), + "execution_time": execution_time, + }, + ) + cache.set(key, json.dumps(queries), timeout=120) + except Exception as e: + capture_exception(e) diff --git a/ee/clickhouse/demo.py b/ee/clickhouse/demo.py index 9045679b8ddce..f9fd1a9668019 100644 --- a/ee/clickhouse/demo.py +++ b/ee/clickhouse/demo.py @@ -1,157 +1,15 @@ -import json -import random -from pathlib import Path -from typing import List +from typing import Dict, List from uuid import uuid4 -from dateutil.relativedelta import relativedelta -from django.utils.timezone import now - -from ee.clickhouse.models.clickhouse import generate_clickhouse_uuid from ee.clickhouse.models.event import create_event -from ee.clickhouse.models.person import update_person_is_identified, update_person_properties -from posthog.models import Team -from posthog.models.element import Element -from posthog.models.person import Person - - -def create_anonymous_users_ch(team: Team, base_url: str) -> None: - with open(Path("posthog/demo_data.json").resolve(), "r") as demo_data_file: - demo_data = json.load(demo_data_file) - - demo_data_index = 0 - days_ago = 7 - for index in range(0, 100): - if index > 0 and index % 14 == 0: - days_ago -= 1 - - date = now() - relativedelta(days=days_ago) - browser = random.choice(["Chrome", "Safari", "Firefox"]) - - distinct_id = generate_clickhouse_uuid() - person = Person.objects.create(team_id=team.pk, distinct_ids=[distinct_id], properties={"is_demo": True}) - - event_uuid = uuid4() - create_event( - team=team, - event="$pageview", - distinct_id=distinct_id, - properties={"$current_url": base_url, "$browser": browser, "$lib": "web",}, - timestamp=date, - event_uuid=event_uuid, - ) - - if index % 3 == 0: - - update_person_properties(team_id=team.pk, id=person.uuid, properties=demo_data[demo_data_index]) - update_person_is_identified(team_id=team.pk, id=person.uuid, is_identified=True) - demo_data_index 
+= 1 - - create_event( - team=team, - distinct_id=distinct_id, - event="$autocapture", - properties={"$current_url": base_url, "$browser": browser, "$lib": "web", "$event_type": "click",}, - timestamp=date + relativedelta(seconds=14), - event_uuid=event_uuid, - elements=[ - Element( - tag_name="a", - href="/demo/1", - attr_class=["btn", "btn-success"], - attr_id="sign-up", - text="Sign up", - ), - Element(tag_name="form", attr_class=["form"]), - Element(tag_name="div", attr_class=["container"]), - Element(tag_name="body"), - Element(tag_name="html"), - ], - ) - - event_uuid = uuid4() - create_event( - event="$pageview", - team=team, - distinct_id=distinct_id, - properties={"$current_url": "%s/1" % base_url, "$browser": browser, "$lib": "web",}, - timestamp=date + relativedelta(seconds=15), - event_uuid=event_uuid, - ) - - if index % 4 == 0: - create_event( - team=team, - event="$autocapture", - distinct_id=distinct_id, - properties={ - "$current_url": "%s/1" % base_url, - "$browser": browser, - "$lib": "web", - "$event_type": "click", - }, - timestamp=date + relativedelta(seconds=29), - event_uuid=event_uuid, - elements=[ - Element(tag_name="button", attr_class=["btn", "btn-success"], text="Sign up!",), - Element(tag_name="form", attr_class=["form"]), - Element(tag_name="div", attr_class=["container"]), - Element(tag_name="body"), - Element(tag_name="html"), - ], - ) - - event_uuid = uuid4() - create_event( - event="$pageview", - team=team, - distinct_id=distinct_id, - properties={"$current_url": "%s/2" % base_url, "$browser": browser, "$lib": "web",}, - timestamp=date + relativedelta(seconds=30), - event_uuid=event_uuid, - ) +from ee.clickhouse.models.session_recording_event import create_session_recording_event - if index % 5 == 0: - create_event( - team=team, - event="$autocapture", - distinct_id=distinct_id, - properties={ - "$current_url": "%s/2" % base_url, - "$browser": browser, - "$lib": "web", - "$event_type": "click", - }, - timestamp=date + relativedelta(seconds=59), - event_uuid=event_uuid, - elements=[ - Element(tag_name="button", attr_class=["btn", "btn-success"], text="Pay $10",), - Element(tag_name="form", attr_class=["form"]), - Element(tag_name="div", attr_class=["container"]), - Element(tag_name="body"), - Element(tag_name="html"), - ], - ) - event_uuid = uuid4() - create_event( - event="purchase", - team=team, - distinct_id=distinct_id, - properties={"price": 10}, - timestamp=date + relativedelta(seconds=60), - event_uuid=event_uuid, - ) +def bulk_create_events(events: List[Dict], **kw): + for event_data in events: + create_event(**event_data, **kw, event_uuid=uuid4()) # type: ignore - event_uuid = uuid4() - create_event( - event="$pageview", - team=team, - distinct_id=distinct_id, - properties={"$current_url": "%s/3" % base_url, "$browser": browser, "$lib": "web",}, - timestamp=date + relativedelta(seconds=60), - event_uuid=event_uuid, - ) - team.event_properties_numerical.append("purchase") - team.save() +def bulk_create_session_recording_events(events: List[Dict], **kw): + for data in events: + create_session_recording_event(**data, **kw, uuid=uuid4()) # type: ignore diff --git a/ee/clickhouse/errors.py b/ee/clickhouse/errors.py new file mode 100644 index 0000000000000..98645cca3e22c --- /dev/null +++ b/ee/clickhouse/errors.py @@ -0,0 +1,581 @@ +import re + +from clickhouse_driver.errors import ServerException + +from posthog.exceptions import EstimatedQueryExecutionTimeTooLong + + +def wrap_query_error(err: Exception) -> Exception: + "Beautifies clickhouse client 
errors, using custom error classes for every code" + if not isinstance(err, ServerException): + return err + + # Return a 512 error for queries which would time out + match = re.search(r"Estimated query execution time \(.* seconds\) is too long.", err.message) + if match: + return EstimatedQueryExecutionTimeTooLong(detail=match.group(0)) + + # :TRICKY: Return a custom class for every code by looking up the short name and creating a class dynamically. + if hasattr(err, "code"): + name = CLICKHOUSE_ERROR_CODE_LOOKUP.get(err.code, "UNKNOWN") + name = f"CHQueryError{name.replace('_', ' ').title().replace(' ', '')}" + return type(name, (ServerException,), {})(err.message, code=err.code) + return err + + +# From https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/ErrorCodes.cpp#L15 +CLICKHOUSE_ERROR_CODE_LOOKUP = { + 0: "OK", + 1: "UNSUPPORTED_METHOD", + 2: "UNSUPPORTED_PARAMETER", + 3: "UNEXPECTED_END_OF_FILE", + 4: "EXPECTED_END_OF_FILE", + 6: "CANNOT_PARSE_TEXT", + 7: "INCORRECT_NUMBER_OF_COLUMNS", + 8: "THERE_IS_NO_COLUMN", + 9: "SIZES_OF_COLUMNS_DOESNT_MATCH", + 10: "NOT_FOUND_COLUMN_IN_BLOCK", + 11: "POSITION_OUT_OF_BOUND", + 12: "PARAMETER_OUT_OF_BOUND", + 13: "SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH", + 15: "DUPLICATE_COLUMN", + 16: "NO_SUCH_COLUMN_IN_TABLE", + 17: "DELIMITER_IN_STRING_LITERAL_DOESNT_MATCH", + 18: "CANNOT_INSERT_ELEMENT_INTO_CONSTANT_COLUMN", + 19: "SIZE_OF_FIXED_STRING_DOESNT_MATCH", + 20: "NUMBER_OF_COLUMNS_DOESNT_MATCH", + 21: "CANNOT_READ_ALL_DATA_FROM_TAB_SEPARATED_INPUT", + 22: "CANNOT_PARSE_ALL_VALUE_FROM_TAB_SEPARATED_INPUT", + 23: "CANNOT_READ_FROM_ISTREAM", + 24: "CANNOT_WRITE_TO_OSTREAM", + 25: "CANNOT_PARSE_ESCAPE_SEQUENCE", + 26: "CANNOT_PARSE_QUOTED_STRING", + 27: "CANNOT_PARSE_INPUT_ASSERTION_FAILED", + 28: "CANNOT_PRINT_FLOAT_OR_DOUBLE_NUMBER", + 29: "CANNOT_PRINT_INTEGER", + 30: "CANNOT_READ_SIZE_OF_COMPRESSED_CHUNK", + 31: "CANNOT_READ_COMPRESSED_CHUNK", + 32: "ATTEMPT_TO_READ_AFTER_EOF", + 33: "CANNOT_READ_ALL_DATA", + 34: "TOO_MANY_ARGUMENTS_FOR_FUNCTION", + 35: "TOO_FEW_ARGUMENTS_FOR_FUNCTION", + 36: "BAD_ARGUMENTS", + 37: "UNKNOWN_ELEMENT_IN_AST", + 38: "CANNOT_PARSE_DATE", + 39: "TOO_LARGE_SIZE_COMPRESSED", + 40: "CHECKSUM_DOESNT_MATCH", + 41: "CANNOT_PARSE_DATETIME", + 42: "NUMBER_OF_ARGUMENTS_DOESNT_MATCH", + 43: "ILLEGAL_TYPE_OF_ARGUMENT", + 44: "ILLEGAL_COLUMN", + 45: "ILLEGAL_NUMBER_OF_RESULT_COLUMNS", + 46: "UNKNOWN_FUNCTION", + 47: "UNKNOWN_IDENTIFIER", + 48: "NOT_IMPLEMENTED", + 49: "LOGICAL_ERROR", + 50: "UNKNOWN_TYPE", + 51: "EMPTY_LIST_OF_COLUMNS_QUERIED", + 52: "COLUMN_QUERIED_MORE_THAN_ONCE", + 53: "TYPE_MISMATCH", + 54: "STORAGE_DOESNT_ALLOW_PARAMETERS", + 55: "STORAGE_REQUIRES_PARAMETER", + 56: "UNKNOWN_STORAGE", + 57: "TABLE_ALREADY_EXISTS", + 58: "TABLE_METADATA_ALREADY_EXISTS", + 59: "ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER", + 60: "UNKNOWN_TABLE", + 61: "ONLY_FILTER_COLUMN_IN_BLOCK", + 62: "SYNTAX_ERROR", + 63: "UNKNOWN_AGGREGATE_FUNCTION", + 64: "CANNOT_READ_AGGREGATE_FUNCTION_FROM_TEXT", + 65: "CANNOT_WRITE_AGGREGATE_FUNCTION_AS_TEXT", + 66: "NOT_A_COLUMN", + 67: "ILLEGAL_KEY_OF_AGGREGATION", + 68: "CANNOT_GET_SIZE_OF_FIELD", + 69: "ARGUMENT_OUT_OF_BOUND", + 70: "CANNOT_CONVERT_TYPE", + 71: "CANNOT_WRITE_AFTER_END_OF_BUFFER", + 72: "CANNOT_PARSE_NUMBER", + 73: "UNKNOWN_FORMAT", + 74: "CANNOT_READ_FROM_FILE_DESCRIPTOR", + 75: "CANNOT_WRITE_TO_FILE_DESCRIPTOR", + 76: "CANNOT_OPEN_FILE", + 77: "CANNOT_CLOSE_FILE", + 78: "UNKNOWN_TYPE_OF_QUERY", + 79: "INCORRECT_FILE_NAME", + 80: "INCORRECT_QUERY", + 81: "UNKNOWN_DATABASE", + 
82: "DATABASE_ALREADY_EXISTS", + 83: "DIRECTORY_DOESNT_EXIST", + 84: "DIRECTORY_ALREADY_EXISTS", + 85: "FORMAT_IS_NOT_SUITABLE_FOR_INPUT", + 86: "RECEIVED_ERROR_FROM_REMOTE_IO_SERVER", + 87: "CANNOT_SEEK_THROUGH_FILE", + 88: "CANNOT_TRUNCATE_FILE", + 89: "UNKNOWN_COMPRESSION_METHOD", + 90: "EMPTY_LIST_OF_COLUMNS_PASSED", + 91: "SIZES_OF_MARKS_FILES_ARE_INCONSISTENT", + 92: "EMPTY_DATA_PASSED", + 93: "UNKNOWN_AGGREGATED_DATA_VARIANT", + 94: "CANNOT_MERGE_DIFFERENT_AGGREGATED_DATA_VARIANTS", + 95: "CANNOT_READ_FROM_SOCKET", + 96: "CANNOT_WRITE_TO_SOCKET", + 97: "CANNOT_READ_ALL_DATA_FROM_CHUNKED_INPUT", + 98: "CANNOT_WRITE_TO_EMPTY_BLOCK_OUTPUT_STREAM", + 99: "UNKNOWN_PACKET_FROM_CLIENT", + 100: "UNKNOWN_PACKET_FROM_SERVER", + 101: "UNEXPECTED_PACKET_FROM_CLIENT", + 102: "UNEXPECTED_PACKET_FROM_SERVER", + 103: "RECEIVED_DATA_FOR_WRONG_QUERY_ID", + 104: "TOO_SMALL_BUFFER_SIZE", + 105: "CANNOT_READ_HISTORY", + 106: "CANNOT_APPEND_HISTORY", + 107: "FILE_DOESNT_EXIST", + 108: "NO_DATA_TO_INSERT", + 109: "CANNOT_BLOCK_SIGNAL", + 110: "CANNOT_UNBLOCK_SIGNAL", + 111: "CANNOT_MANIPULATE_SIGSET", + 112: "CANNOT_WAIT_FOR_SIGNAL", + 113: "THERE_IS_NO_SESSION", + 114: "CANNOT_CLOCK_GETTIME", + 115: "UNKNOWN_SETTING", + 116: "THERE_IS_NO_DEFAULT_VALUE", + 117: "INCORRECT_DATA", + 119: "ENGINE_REQUIRED", + 120: "CANNOT_INSERT_VALUE_OF_DIFFERENT_SIZE_INTO_TUPLE", + 121: "UNSUPPORTED_JOIN_KEYS", + 122: "INCOMPATIBLE_COLUMNS", + 123: "UNKNOWN_TYPE_OF_AST_NODE", + 124: "INCORRECT_ELEMENT_OF_SET", + 125: "INCORRECT_RESULT_OF_SCALAR_SUBQUERY", + 126: "CANNOT_GET_RETURN_TYPE", + 127: "ILLEGAL_INDEX", + 128: "TOO_LARGE_ARRAY_SIZE", + 129: "FUNCTION_IS_SPECIAL", + 130: "CANNOT_READ_ARRAY_FROM_TEXT", + 131: "TOO_LARGE_STRING_SIZE", + 133: "AGGREGATE_FUNCTION_DOESNT_ALLOW_PARAMETERS", + 134: "PARAMETERS_TO_AGGREGATE_FUNCTIONS_MUST_BE_LITERALS", + 135: "ZERO_ARRAY_OR_TUPLE_INDEX", + 137: "UNKNOWN_ELEMENT_IN_CONFIG", + 138: "EXCESSIVE_ELEMENT_IN_CONFIG", + 139: "NO_ELEMENTS_IN_CONFIG", + 140: "ALL_REQUESTED_COLUMNS_ARE_MISSING", + 141: "SAMPLING_NOT_SUPPORTED", + 142: "NOT_FOUND_NODE", + 143: "FOUND_MORE_THAN_ONE_NODE", + 144: "FIRST_DATE_IS_BIGGER_THAN_LAST_DATE", + 145: "UNKNOWN_OVERFLOW_MODE", + 146: "QUERY_SECTION_DOESNT_MAKE_SENSE", + 147: "NOT_FOUND_FUNCTION_ELEMENT_FOR_AGGREGATE", + 148: "NOT_FOUND_RELATION_ELEMENT_FOR_CONDITION", + 149: "NOT_FOUND_RHS_ELEMENT_FOR_CONDITION", + 150: "EMPTY_LIST_OF_ATTRIBUTES_PASSED", + 151: "INDEX_OF_COLUMN_IN_SORT_CLAUSE_IS_OUT_OF_RANGE", + 152: "UNKNOWN_DIRECTION_OF_SORTING", + 153: "ILLEGAL_DIVISION", + 154: "AGGREGATE_FUNCTION_NOT_APPLICABLE", + 155: "UNKNOWN_RELATION", + 156: "DICTIONARIES_WAS_NOT_LOADED", + 157: "ILLEGAL_OVERFLOW_MODE", + 158: "TOO_MANY_ROWS", + 159: "TIMEOUT_EXCEEDED", + 160: "TOO_SLOW", + 161: "TOO_MANY_COLUMNS", + 162: "TOO_DEEP_SUBQUERIES", + 163: "TOO_DEEP_PIPELINE", + 164: "READONLY", + 165: "TOO_MANY_TEMPORARY_COLUMNS", + 166: "TOO_MANY_TEMPORARY_NON_CONST_COLUMNS", + 167: "TOO_DEEP_AST", + 168: "TOO_BIG_AST", + 169: "BAD_TYPE_OF_FIELD", + 170: "BAD_GET", + 172: "CANNOT_CREATE_DIRECTORY", + 173: "CANNOT_ALLOCATE_MEMORY", + 174: "CYCLIC_ALIASES", + 176: "CHUNK_NOT_FOUND", + 177: "DUPLICATE_CHUNK_NAME", + 178: "MULTIPLE_ALIASES_FOR_EXPRESSION", + 179: "MULTIPLE_EXPRESSIONS_FOR_ALIAS", + 180: "THERE_IS_NO_PROFILE", + 181: "ILLEGAL_FINAL", + 182: "ILLEGAL_PREWHERE", + 183: "UNEXPECTED_EXPRESSION", + 184: "ILLEGAL_AGGREGATION", + 185: "UNSUPPORTED_MYISAM_BLOCK_TYPE", + 186: "UNSUPPORTED_COLLATION_LOCALE", + 187: "COLLATION_COMPARISON_FAILED", + 188: 
"UNKNOWN_ACTION", + 189: "TABLE_MUST_NOT_BE_CREATED_MANUALLY", + 190: "SIZES_OF_ARRAYS_DOESNT_MATCH", + 191: "SET_SIZE_LIMIT_EXCEEDED", + 192: "UNKNOWN_USER", + 193: "WRONG_PASSWORD", + 194: "REQUIRED_PASSWORD", + 195: "IP_ADDRESS_NOT_ALLOWED", + 196: "UNKNOWN_ADDRESS_PATTERN_TYPE", + 197: "SERVER_REVISION_IS_TOO_OLD", + 198: "DNS_ERROR", + 199: "UNKNOWN_QUOTA", + 200: "QUOTA_DOESNT_ALLOW_KEYS", + 201: "QUOTA_EXPIRED", + 202: "TOO_MANY_SIMULTANEOUS_QUERIES", + 203: "NO_FREE_CONNECTION", + 204: "CANNOT_FSYNC", + 205: "NESTED_TYPE_TOO_DEEP", + 206: "ALIAS_REQUIRED", + 207: "AMBIGUOUS_IDENTIFIER", + 208: "EMPTY_NESTED_TABLE", + 209: "SOCKET_TIMEOUT", + 210: "NETWORK_ERROR", + 211: "EMPTY_QUERY", + 212: "UNKNOWN_LOAD_BALANCING", + 213: "UNKNOWN_TOTALS_MODE", + 214: "CANNOT_STATVFS", + 215: "NOT_AN_AGGREGATE", + 216: "QUERY_WITH_SAME_ID_IS_ALREADY_RUNNING", + 217: "CLIENT_HAS_CONNECTED_TO_WRONG_PORT", + 218: "TABLE_IS_DROPPED", + 219: "DATABASE_NOT_EMPTY", + 220: "DUPLICATE_INTERSERVER_IO_ENDPOINT", + 221: "NO_SUCH_INTERSERVER_IO_ENDPOINT", + 222: "ADDING_REPLICA_TO_NON_EMPTY_TABLE", + 223: "UNEXPECTED_AST_STRUCTURE", + 224: "REPLICA_IS_ALREADY_ACTIVE", + 225: "NO_ZOOKEEPER", + 226: "NO_FILE_IN_DATA_PART", + 227: "UNEXPECTED_FILE_IN_DATA_PART", + 228: "BAD_SIZE_OF_FILE_IN_DATA_PART", + 229: "QUERY_IS_TOO_LARGE", + 230: "NOT_FOUND_EXPECTED_DATA_PART", + 231: "TOO_MANY_UNEXPECTED_DATA_PARTS", + 232: "NO_SUCH_DATA_PART", + 233: "BAD_DATA_PART_NAME", + 234: "NO_REPLICA_HAS_PART", + 235: "DUPLICATE_DATA_PART", + 236: "ABORTED", + 237: "NO_REPLICA_NAME_GIVEN", + 238: "FORMAT_VERSION_TOO_OLD", + 239: "CANNOT_MUNMAP", + 240: "CANNOT_MREMAP", + 241: "MEMORY_LIMIT_EXCEEDED", + 242: "TABLE_IS_READ_ONLY", + 243: "NOT_ENOUGH_SPACE", + 244: "UNEXPECTED_ZOOKEEPER_ERROR", + 246: "CORRUPTED_DATA", + 247: "INCORRECT_MARK", + 248: "INVALID_PARTITION_VALUE", + 250: "NOT_ENOUGH_BLOCK_NUMBERS", + 251: "NO_SUCH_REPLICA", + 252: "TOO_MANY_PARTS", + 253: "REPLICA_IS_ALREADY_EXIST", + 254: "NO_ACTIVE_REPLICAS", + 255: "TOO_MANY_RETRIES_TO_FETCH_PARTS", + 256: "PARTITION_ALREADY_EXISTS", + 257: "PARTITION_DOESNT_EXIST", + 258: "UNION_ALL_RESULT_STRUCTURES_MISMATCH", + 260: "CLIENT_OUTPUT_FORMAT_SPECIFIED", + 261: "UNKNOWN_BLOCK_INFO_FIELD", + 262: "BAD_COLLATION", + 263: "CANNOT_COMPILE_CODE", + 264: "INCOMPATIBLE_TYPE_OF_JOIN", + 265: "NO_AVAILABLE_REPLICA", + 266: "MISMATCH_REPLICAS_DATA_SOURCES", + 267: "STORAGE_DOESNT_SUPPORT_PARALLEL_REPLICAS", + 268: "CPUID_ERROR", + 269: "INFINITE_LOOP", + 270: "CANNOT_COMPRESS", + 271: "CANNOT_DECOMPRESS", + 272: "CANNOT_IO_SUBMIT", + 273: "CANNOT_IO_GETEVENTS", + 274: "AIO_READ_ERROR", + 275: "AIO_WRITE_ERROR", + 277: "INDEX_NOT_USED", + 279: "ALL_CONNECTION_TRIES_FAILED", + 280: "NO_AVAILABLE_DATA", + 281: "DICTIONARY_IS_EMPTY", + 282: "INCORRECT_INDEX", + 283: "UNKNOWN_DISTRIBUTED_PRODUCT_MODE", + 284: "WRONG_GLOBAL_SUBQUERY", + 285: "TOO_FEW_LIVE_REPLICAS", + 286: "UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE", + 287: "UNKNOWN_FORMAT_VERSION", + 288: "DISTRIBUTED_IN_JOIN_SUBQUERY_DENIED", + 289: "REPLICA_IS_NOT_IN_QUORUM", + 290: "LIMIT_EXCEEDED", + 291: "DATABASE_ACCESS_DENIED", + 293: "MONGODB_CANNOT_AUTHENTICATE", + 294: "INVALID_BLOCK_EXTRA_INFO", + 295: "RECEIVED_EMPTY_DATA", + 296: "NO_REMOTE_SHARD_FOUND", + 297: "SHARD_HAS_NO_CONNECTIONS", + 298: "CANNOT_PIPE", + 299: "CANNOT_FORK", + 300: "CANNOT_DLSYM", + 301: "CANNOT_CREATE_CHILD_PROCESS", + 302: "CHILD_WAS_NOT_EXITED_NORMALLY", + 303: "CANNOT_SELECT", + 304: "CANNOT_WAITPID", + 305: "TABLE_WAS_NOT_DROPPED", + 306: 
"TOO_DEEP_RECURSION", + 307: "TOO_MANY_BYTES", + 308: "UNEXPECTED_NODE_IN_ZOOKEEPER", + 309: "FUNCTION_CANNOT_HAVE_PARAMETERS", + 317: "INVALID_SHARD_WEIGHT", + 318: "INVALID_CONFIG_PARAMETER", + 319: "UNKNOWN_STATUS_OF_INSERT", + 321: "VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE", + 335: "BARRIER_TIMEOUT", + 336: "UNKNOWN_DATABASE_ENGINE", + 337: "DDL_GUARD_IS_ACTIVE", + 341: "UNFINISHED", + 342: "METADATA_MISMATCH", + 344: "SUPPORT_IS_DISABLED", + 345: "TABLE_DIFFERS_TOO_MUCH", + 346: "CANNOT_CONVERT_CHARSET", + 347: "CANNOT_LOAD_CONFIG", + 349: "CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN", + 350: "INCOMPATIBLE_SOURCE_TABLES", + 351: "AMBIGUOUS_TABLE_NAME", + 352: "AMBIGUOUS_COLUMN_NAME", + 353: "INDEX_OF_POSITIONAL_ARGUMENT_IS_OUT_OF_RANGE", + 354: "ZLIB_INFLATE_FAILED", + 355: "ZLIB_DEFLATE_FAILED", + 356: "BAD_LAMBDA", + 357: "RESERVED_IDENTIFIER_NAME", + 358: "INTO_OUTFILE_NOT_ALLOWED", + 359: "TABLE_SIZE_EXCEEDS_MAX_DROP_SIZE_LIMIT", + 360: "CANNOT_CREATE_CHARSET_CONVERTER", + 361: "SEEK_POSITION_OUT_OF_BOUND", + 362: "CURRENT_WRITE_BUFFER_IS_EXHAUSTED", + 363: "CANNOT_CREATE_IO_BUFFER", + 364: "RECEIVED_ERROR_TOO_MANY_REQUESTS", + 366: "SIZES_OF_NESTED_COLUMNS_ARE_INCONSISTENT", + 367: "TOO_MANY_FETCHES", + 369: "ALL_REPLICAS_ARE_STALE", + 370: "DATA_TYPE_CANNOT_BE_USED_IN_TABLES", + 371: "INCONSISTENT_CLUSTER_DEFINITION", + 372: "SESSION_NOT_FOUND", + 373: "SESSION_IS_LOCKED", + 374: "INVALID_SESSION_TIMEOUT", + 375: "CANNOT_DLOPEN", + 376: "CANNOT_PARSE_UUID", + 377: "ILLEGAL_SYNTAX_FOR_DATA_TYPE", + 378: "DATA_TYPE_CANNOT_HAVE_ARGUMENTS", + 379: "UNKNOWN_STATUS_OF_DISTRIBUTED_DDL_TASK", + 380: "CANNOT_KILL", + 381: "HTTP_LENGTH_REQUIRED", + 382: "CANNOT_LOAD_CATBOOST_MODEL", + 383: "CANNOT_APPLY_CATBOOST_MODEL", + 384: "PART_IS_TEMPORARILY_LOCKED", + 385: "MULTIPLE_STREAMS_REQUIRED", + 386: "NO_COMMON_TYPE", + 387: "DICTIONARY_ALREADY_EXISTS", + 388: "CANNOT_ASSIGN_OPTIMIZE", + 389: "INSERT_WAS_DEDUPLICATED", + 390: "CANNOT_GET_CREATE_TABLE_QUERY", + 391: "EXTERNAL_LIBRARY_ERROR", + 392: "QUERY_IS_PROHIBITED", + 393: "THERE_IS_NO_QUERY", + 394: "QUERY_WAS_CANCELLED", + 395: "FUNCTION_THROW_IF_VALUE_IS_NON_ZERO", + 396: "TOO_MANY_ROWS_OR_BYTES", + 397: "QUERY_IS_NOT_SUPPORTED_IN_MATERIALIZED_VIEW", + 398: "UNKNOWN_MUTATION_COMMAND", + 399: "FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT", + 400: "CANNOT_STAT", + 401: "FEATURE_IS_NOT_ENABLED_AT_BUILD_TIME", + 402: "CANNOT_IOSETUP", + 403: "INVALID_JOIN_ON_EXPRESSION", + 404: "BAD_ODBC_CONNECTION_STRING", + 405: "PARTITION_SIZE_EXCEEDS_MAX_DROP_SIZE_LIMIT", + 406: "TOP_AND_LIMIT_TOGETHER", + 407: "DECIMAL_OVERFLOW", + 408: "BAD_REQUEST_PARAMETER", + 409: "EXTERNAL_EXECUTABLE_NOT_FOUND", + 410: "EXTERNAL_SERVER_IS_NOT_RESPONDING", + 411: "PTHREAD_ERROR", + 412: "NETLINK_ERROR", + 413: "CANNOT_SET_SIGNAL_HANDLER", + 415: "ALL_REPLICAS_LOST", + 416: "REPLICA_STATUS_CHANGED", + 417: "EXPECTED_ALL_OR_ANY", + 418: "UNKNOWN_JOIN", + 419: "MULTIPLE_ASSIGNMENTS_TO_COLUMN", + 420: "CANNOT_UPDATE_COLUMN", + 421: "CANNOT_ADD_DIFFERENT_AGGREGATE_STATES", + 422: "UNSUPPORTED_URI_SCHEME", + 423: "CANNOT_GETTIMEOFDAY", + 424: "CANNOT_LINK", + 425: "SYSTEM_ERROR", + 427: "CANNOT_COMPILE_REGEXP", + 428: "UNKNOWN_LOG_LEVEL", + 429: "FAILED_TO_GETPWUID", + 430: "MISMATCHING_USERS_FOR_PROCESS_AND_DATA", + 431: "ILLEGAL_SYNTAX_FOR_CODEC_TYPE", + 432: "UNKNOWN_CODEC", + 433: "ILLEGAL_CODEC_PARAMETER", + 434: "CANNOT_PARSE_PROTOBUF_SCHEMA", + 435: "NO_COLUMN_SERIALIZED_TO_REQUIRED_PROTOBUF_FIELD", + 436: "PROTOBUF_BAD_CAST", + 437: "PROTOBUF_FIELD_NOT_REPEATED", + 438: 
"DATA_TYPE_CANNOT_BE_PROMOTED", + 439: "CANNOT_SCHEDULE_TASK", + 440: "INVALID_LIMIT_EXPRESSION", + 441: "CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING", + 442: "BAD_DATABASE_FOR_TEMPORARY_TABLE", + 443: "NO_COLUMNS_SERIALIZED_TO_PROTOBUF_FIELDS", + 444: "UNKNOWN_PROTOBUF_FORMAT", + 445: "CANNOT_MPROTECT", + 446: "FUNCTION_NOT_ALLOWED", + 447: "HYPERSCAN_CANNOT_SCAN_TEXT", + 448: "BROTLI_READ_FAILED", + 449: "BROTLI_WRITE_FAILED", + 450: "BAD_TTL_EXPRESSION", + 451: "BAD_TTL_FILE", + 452: "SETTING_CONSTRAINT_VIOLATION", + 453: "MYSQL_CLIENT_INSUFFICIENT_CAPABILITIES", + 454: "OPENSSL_ERROR", + 455: "SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY", + 456: "UNKNOWN_QUERY_PARAMETER", + 457: "BAD_QUERY_PARAMETER", + 458: "CANNOT_UNLINK", + 459: "CANNOT_SET_THREAD_PRIORITY", + 460: "CANNOT_CREATE_TIMER", + 461: "CANNOT_SET_TIMER_PERIOD", + 462: "CANNOT_DELETE_TIMER", + 463: "CANNOT_FCNTL", + 464: "CANNOT_PARSE_ELF", + 465: "CANNOT_PARSE_DWARF", + 466: "INSECURE_PATH", + 467: "CANNOT_PARSE_BOOL", + 468: "CANNOT_PTHREAD_ATTR", + 469: "VIOLATED_CONSTRAINT", + 470: "QUERY_IS_NOT_SUPPORTED_IN_LIVE_VIEW", + 471: "INVALID_SETTING_VALUE", + 472: "READONLY_SETTING", + 473: "DEADLOCK_AVOIDED", + 474: "INVALID_TEMPLATE_FORMAT", + 475: "INVALID_WITH_FILL_EXPRESSION", + 476: "WITH_TIES_WITHOUT_ORDER_BY", + 477: "INVALID_USAGE_OF_INPUT", + 478: "UNKNOWN_POLICY", + 479: "UNKNOWN_DISK", + 480: "UNKNOWN_PROTOCOL", + 481: "PATH_ACCESS_DENIED", + 482: "DICTIONARY_ACCESS_DENIED", + 483: "TOO_MANY_REDIRECTS", + 484: "INTERNAL_REDIS_ERROR", + 485: "SCALAR_ALREADY_EXISTS", + 487: "CANNOT_GET_CREATE_DICTIONARY_QUERY", + 488: "UNKNOWN_DICTIONARY", + 489: "INCORRECT_DICTIONARY_DEFINITION", + 490: "CANNOT_FORMAT_DATETIME", + 491: "UNACCEPTABLE_URL", + 492: "ACCESS_ENTITY_NOT_FOUND", + 493: "ACCESS_ENTITY_ALREADY_EXISTS", + 494: "ACCESS_ENTITY_FOUND_DUPLICATES", + 495: "ACCESS_STORAGE_READONLY", + 496: "QUOTA_REQUIRES_CLIENT_KEY", + 497: "ACCESS_DENIED", + 498: "LIMIT_BY_WITH_TIES_IS_NOT_SUPPORTED", + 499: "S3_ERROR", + 501: "CANNOT_CREATE_DATABASE", + 502: "CANNOT_SIGQUEUE", + 503: "AGGREGATE_FUNCTION_THROW", + 504: "FILE_ALREADY_EXISTS", + 505: "CANNOT_DELETE_DIRECTORY", + 506: "UNEXPECTED_ERROR_CODE", + 507: "UNABLE_TO_SKIP_UNUSED_SHARDS", + 508: "UNKNOWN_ACCESS_TYPE", + 509: "INVALID_GRANT", + 510: "CACHE_DICTIONARY_UPDATE_FAIL", + 511: "UNKNOWN_ROLE", + 512: "SET_NON_GRANTED_ROLE", + 513: "UNKNOWN_PART_TYPE", + 514: "ACCESS_STORAGE_FOR_INSERTION_NOT_FOUND", + 515: "INCORRECT_ACCESS_ENTITY_DEFINITION", + 516: "AUTHENTICATION_FAILED", + 517: "CANNOT_ASSIGN_ALTER", + 518: "CANNOT_COMMIT_OFFSET", + 519: "NO_REMOTE_SHARD_AVAILABLE", + 520: "CANNOT_DETACH_DICTIONARY_AS_TABLE", + 521: "ATOMIC_RENAME_FAIL", + 523: "UNKNOWN_ROW_POLICY", + 524: "ALTER_OF_COLUMN_IS_FORBIDDEN", + 525: "INCORRECT_DISK_INDEX", + 526: "UNKNOWN_VOLUME_TYPE", + 527: "NO_SUITABLE_FUNCTION_IMPLEMENTATION", + 528: "CASSANDRA_INTERNAL_ERROR", + 529: "NOT_A_LEADER", + 530: "CANNOT_CONNECT_RABBITMQ", + 531: "CANNOT_FSTAT", + 532: "LDAP_ERROR", + 533: "INCONSISTENT_RESERVATIONS", + 534: "NO_RESERVATIONS_PROVIDED", + 535: "UNKNOWN_RAID_TYPE", + 536: "CANNOT_RESTORE_FROM_FIELD_DUMP", + 537: "ILLEGAL_MYSQL_VARIABLE", + 538: "MYSQL_SYNTAX_ERROR", + 539: "CANNOT_BIND_RABBITMQ_EXCHANGE", + 540: "CANNOT_DECLARE_RABBITMQ_EXCHANGE", + 541: "CANNOT_CREATE_RABBITMQ_QUEUE_BINDING", + 542: "CANNOT_REMOVE_RABBITMQ_EXCHANGE", + 543: "UNKNOWN_MYSQL_DATATYPES_SUPPORT_LEVEL", + 544: "ROW_AND_ROWS_TOGETHER", + 545: "FIRST_AND_NEXT_TOGETHER", + 546: "NO_ROW_DELIMITER", + 547: 
"INVALID_RAID_TYPE", + 548: "UNKNOWN_VOLUME", + 549: "DATA_TYPE_CANNOT_BE_USED_IN_KEY", + 550: "CONDITIONAL_TREE_PARENT_NOT_FOUND", + 551: "ILLEGAL_PROJECTION_MANIPULATOR", + 552: "UNRECOGNIZED_ARGUMENTS", + 553: "LZMA_STREAM_ENCODER_FAILED", + 554: "LZMA_STREAM_DECODER_FAILED", + 555: "ROCKSDB_ERROR", + 556: "SYNC_MYSQL_USER_ACCESS_ERRO", + 557: "UNKNOWN_UNION", + 558: "EXPECTED_ALL_OR_DISTINCT", + 559: "INVALID_GRPC_QUERY_INFO", + 560: "ZSTD_ENCODER_FAILED", + 561: "ZSTD_DECODER_FAILED", + 562: "TLD_LIST_NOT_FOUND", + 563: "CANNOT_READ_MAP_FROM_TEXT", + 564: "INTERSERVER_SCHEME_DOESNT_MATCH", + 565: "TOO_MANY_PARTITIONS", + 566: "CANNOT_RMDIR", + 567: "DUPLICATED_PART_UUIDS", + 568: "RAFT_ERROR", + 569: "MULTIPLE_COLUMNS_SERIALIZED_TO_SAME_PROTOBUF_FIELD", + 570: "DATA_TYPE_INCOMPATIBLE_WITH_PROTOBUF_FIELD", + 571: "DATABASE_REPLICATION_FAILED", + 572: "TOO_MANY_QUERY_PLAN_OPTIMIZATIONS", + 573: "EPOLL_ERROR", + 574: "DISTRIBUTED_TOO_MANY_PENDING_BYTES", + 575: "UNKNOWN_SNAPSHOT", + 576: "KERBEROS_ERROR", + 577: "INVALID_SHARD_ID", + 578: "INVALID_FORMAT_INSERT_QUERY_WITH_DATA", + 579: "INCORRECT_PART_TYPE", + 580: "CANNOT_SET_ROUNDING_MODE", + 581: "TOO_LARGE_DISTRIBUTED_DEPTH", + 582: "NO_SUCH_PROJECTION_IN_TABLE", + 583: "ILLEGAL_PROJECTION", + 584: "PROJECTION_NOT_USED", + 585: "CANNOT_PARSE_YAML", + 586: "CANNOT_CREATE_FILE", + 587: "CONCURRENT_ACCESS_NOT_SUPPORTED", + 588: "DISTRIBUTED_BROKEN_BATCH_INFO", + 589: "DISTRIBUTED_BROKEN_BATCH_FILES", + 590: "CANNOT_SYSCONF", + 591: "SQLITE_ENGINE_ERROR", + 592: "DATA_ENCRYPTION_ERROR", + 593: "ZERO_COPY_REPLICATION_ERROR", + 998: "POSTGRESQL_CONNECTION_FAILURE", + 999: "KEEPER_EXCEPTION", + 1000: "POCO_EXCEPTION", + 1001: "STD_EXCEPTION", + 1002: "UNKNOWN_EXCEPTION", +} diff --git a/ee/clickhouse/generate_local.py b/ee/clickhouse/generate_local.py new file mode 100644 index 0000000000000..e8716b9caa7e7 --- /dev/null +++ b/ee/clickhouse/generate_local.py @@ -0,0 +1,150 @@ +import uuid +from datetime import datetime + +from dateutil.relativedelta import relativedelta +from django.db import connection + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.models.event import create_event +from posthog.models import EventDefinition, Person, Team + +UTC_FORMAT = "%Y-%m-%d %H:%M:%S" + + +class GenerateLocal: + team: Team + number: int + + def __init__(self, team, number=250): + self.team = team + self.number = number + + def generate(self): + self._insert_persons() + self._insert_event_definitions() + self._insert_events() + + def destroy(self): + # You'll need to manually clean up the clickhouse database by: + # 1. docker compose -f ee/docker-compose.ch.yml down clickhouse zookeeper kafka + # 2. 
DEBUG=1;DJANGO_SETTINGS_MODULE=posthog.settings;PRIMARY_DB=clickhouse;CLICKHOUSE_HOST=clickhouse;CLICKHOUSE_DATABASE=posthog;CLICKHOUSE_SECURE=false;CLICKHOUSE_VERIFY=false python migrate.py migrate_clickhouse + + with connection.cursor() as cursor: + cursor.execute("delete from posthog_persondistinctid where distinct_id like 'user_%%'") + cursor.execute("delete from posthog_person where properties->> 'name' like 'user_%'") + cursor.execute("delete from posthog_eventdefinition where name like 'step %'") + + def _insert_event_definitions(self): + EventDefinition.objects.get_or_create(team=self.team, name="step one") + EventDefinition.objects.get_or_create(team=self.team, name="step two") + EventDefinition.objects.get_or_create(team=self.team, name="step three") + EventDefinition.objects.get_or_create(team=self.team, name="step four") + EventDefinition.objects.get_or_create(team=self.team, name="step five") + + def _insert_persons(self): + for i in range(1, self.number + 1): + try: + person = Person.objects.create( + distinct_ids=[f"user_{i}"], team=self.team, properties={"name": f"user_{i}"} + ) + self._insert_person_distinct_ids(f"user_{i}", person.uuid) + except Exception as e: + print(str(e)) + + def _insert_person_distinct_ids(self, distinct_id, person_uuid): + sql = f""" + insert into person_distinct_id (distinct_id, person_id, team_id, _timestamp) values + ('{distinct_id}', '{person_uuid}', '{self.team.id}', now()); + """ + + sync_execute(sql) + + def _insert_events(self): + self._insert_many_events("2010-01-01 00:00:00") + self._insert_hours_events("2011-01-01 00:00:00") + self._insert_days_events("2012-01-01 00:00:00") + self._insert_weeks_events("2013-01-01 00:00:00") + self._insert_months_events("2015-01-01 00:00:00") + + def _insert_many_events(self, start_date): + step_one = self.number + 1 + step_two = round(step_one / 2) + step_three = round(step_one / 3) + step_four = round(step_one / 4) + step_five = round(step_one / 5) + + for i in range(1, step_one): + create_event(uuid.uuid4(), "step one", self.team, f"user_{i}", start_date) + for i in range(1, step_two): + create_event(uuid.uuid4(), "step two", self.team, f"user_{i}", self._add_interval("days", 1, start_date)) + for i in range(1, step_three): + create_event(uuid.uuid4(), "step three", self.team, f"user_{i}", self._add_interval("days", 2, start_date)) + for i in range(1, step_four): + create_event(uuid.uuid4(), "step four", self.team, f"user_{i}", self._add_interval("days", 3, start_date)) + for i in range(1, step_five): + create_event(uuid.uuid4(), "step five", self.team, f"user_{i}", self._add_interval("days", 4, start_date)) + + def _insert_hours_events(self, start_date): + self._case_correct_order("hours", "user_1", start_date) + self._case_reverse_order("hours", "user_2", start_date) + self._case_out_of_order_complete("hours", "user_3", start_date) + + def _insert_days_events(self, start_date): + self._case_correct_order("days", "user_11", start_date) + self._case_reverse_order("days", "user_12", start_date) + self._case_out_of_order_complete("days", "user_13", start_date) + + def _insert_weeks_events(self, start_date): + self._case_correct_order("weeks", "user_21", start_date) + self._case_reverse_order("weeks", "user_22", start_date) + self._case_out_of_order_complete("weeks", "user_23", start_date) + + def _insert_months_events(self, start_date): + self._case_correct_order("months", "user_31", start_date) + self._case_reverse_order("months", "user_32", start_date) + self._case_out_of_order_complete("months", 
"user_33", start_date) + + def _case_correct_order(self, interval, user, start_date): + create_event(uuid.uuid4(), "step one", self.team, user, start_date) + create_event(uuid.uuid4(), "step two", self.team, user, self._add_interval(interval, 1, start_date)) + create_event(uuid.uuid4(), "step three", self.team, user, self._add_interval(interval, 2, start_date)) + create_event(uuid.uuid4(), "step four", self.team, user, self._add_interval(interval, 3, start_date)) + create_event(uuid.uuid4(), "step five", self.team, user, self._add_interval(interval, 4, start_date)) + + def _case_reverse_order(self, interval, user, start_date): + create_event(uuid.uuid4(), "step five", self.team, user, start_date) + create_event(uuid.uuid4(), "step four", self.team, user, self._add_interval(interval, 1, start_date)) + create_event(uuid.uuid4(), "step three", self.team, user, self._add_interval(interval, 2, start_date)) + create_event(uuid.uuid4(), "step two", self.team, user, self._add_interval(interval, 3, start_date)) + create_event(uuid.uuid4(), "step one", self.team, user, self._add_interval(interval, 4, start_date)) + + def _case_out_of_order_complete(self, interval, user, start_date): + create_event(uuid.uuid4(), "step one", self.team, user, start_date) + create_event(uuid.uuid4(), "step three", self.team, user, self._add_interval(interval, 1, start_date)) + create_event(uuid.uuid4(), "step two", self.team, user, self._add_interval(interval, 1, start_date)) + create_event(uuid.uuid4(), "step three", self.team, user, self._add_interval(interval, 2, start_date)) + create_event(uuid.uuid4(), "step five", self.team, user, self._add_interval(interval, 3, start_date)) + create_event(uuid.uuid4(), "step four", self.team, user, self._add_interval(interval, 3, start_date)) + create_event(uuid.uuid4(), "step five", self.team, user, self._add_interval(interval, 4, start_date)) + + def _add_interval(self, interval, delta, date_time_string): + dt = datetime.strptime(date_time_string, UTC_FORMAT) + + if interval == "months": + delta = relativedelta(months=delta) + new_dt = dt + delta + return new_dt.strftime(UTC_FORMAT) + elif interval == "weeks": + delta = relativedelta(weeks=delta) + new_dt = dt + delta + return new_dt.strftime(UTC_FORMAT) + elif interval == "days": + delta = relativedelta(days=delta) + new_dt = dt + delta + return new_dt.strftime(UTC_FORMAT) + elif interval == "hours": + delta = relativedelta(hours=delta) + new_dt = dt + delta + return new_dt.strftime(UTC_FORMAT) + else: + return date_time_string diff --git a/ee/clickhouse/materialized_columns/__init__.py b/ee/clickhouse/materialized_columns/__init__.py new file mode 100644 index 0000000000000..048f9bbfc75e6 --- /dev/null +++ b/ee/clickhouse/materialized_columns/__init__.py @@ -0,0 +1,2 @@ +from .analyze import analyze, get_queries, materialize_properties_task +from .columns import backfill_materialized_columns, get_materialized_columns, materialize, materialized_column_name diff --git a/ee/clickhouse/materialized_columns/analyze.py b/ee/clickhouse/materialized_columns/analyze.py new file mode 100644 index 0000000000000..bc04fac1c7e2c --- /dev/null +++ b/ee/clickhouse/materialized_columns/analyze.py @@ -0,0 +1,161 @@ +import logging +import re +from collections import defaultdict +from datetime import timedelta +from typing import Dict, Generator, List, Optional, Set, Tuple + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.materialized_columns.columns import ( + backfill_materialized_columns, + get_materialized_columns, + 
materialize, +) +from ee.clickhouse.materialized_columns.util import instance_memoize +from ee.clickhouse.sql.person import GET_PERSON_PROPERTIES_COUNT +from ee.settings import ( + MATERIALIZE_COLUMNS_ANALYSIS_PERIOD_HOURS, + MATERIALIZE_COLUMNS_BACKFILL_PERIOD_DAYS, + MATERIALIZE_COLUMNS_MAX_AT_ONCE, + MATERIALIZE_COLUMNS_MINIMUM_QUERY_TIME, +) +from posthog.models.filters.mixins.utils import cached_property +from posthog.models.property import PropertyName, TableWithProperties +from posthog.models.property_definition import PropertyDefinition +from posthog.models.team import Team + +Suggestion = Tuple[TableWithProperties, PropertyName, int] + +logger = logging.getLogger(__name__) + + +class TeamManager: + @instance_memoize + def person_properties(self, team_id: str) -> Set[str]: + rows = sync_execute(GET_PERSON_PROPERTIES_COUNT, {"team_id": team_id}) + return set(name for name, _ in rows) + + @instance_memoize + def event_properties(self, team_id: str) -> Set[str]: + return set(PropertyDefinition.objects.filter(team_id=team_id).values_list("name", flat=True)) + + +class Query: + def __init__(self, query_string: str, query_time_ms: float, min_query_time=MATERIALIZE_COLUMNS_MINIMUM_QUERY_TIME): + self.query_string = query_string + self.query_time_ms = query_time_ms + self.min_query_time = min_query_time + + @property + def cost(self) -> int: + return int((self.query_time_ms - self.min_query_time) / 1000) + 1 + + @cached_property + def is_valid(self): + return self.team_id is not None and Team.objects.filter(pk=self.team_id).exists() + + @cached_property + def team_id(self) -> Optional[str]: + matches = re.findall(r"team_id = (\d+)", self.query_string) + return matches[0] if matches else None + + @cached_property + def _all_properties(self) -> List[PropertyName]: + return re.findall(r"JSONExtract\w+\(\S+, '([^']+)'\)", self.query_string) + + def properties(self, team_manager: TeamManager) -> Generator[Tuple[TableWithProperties, PropertyName], None, None]: + # Reverse-engineer whether a property is an "event" or "person" property by getting their event definitions. + # :KLUDGE: Note that the same property will be found on both tables if both are used. + person_props = team_manager.person_properties(self.team_id) + event_props = team_manager.event_properties(self.team_id) + for property in self._all_properties: + if property in person_props: + yield "person", property + if property in event_props: + yield "events", property + + +def get_queries(since_hours_ago: int, min_query_time: int) -> List[Query]: + "Finds queries that have happened since cutoff that were slow" + + raw_queries = sync_execute( + f""" + SELECT + query, + query_duration_ms + FROM system.query_log + WHERE + query NOT LIKE '%%query_log%%' + AND query LIKE '/* request:%%' + AND query NOT LIKE '%%INSERT%%' + AND type = 'QueryFinish' + AND query_start_time > now() - toIntervalHour(%(since)s) + AND query_duration_ms > %(min_query_time)s + ORDER BY query_duration_ms desc + """, + {"since": since_hours_ago, "min_query_time": min_query_time}, + ) + return [Query(query, query_duration_ms, min_query_time) for query, query_duration_ms in raw_queries] + + +def analyze(queries: List[Query]) -> List[Suggestion]: + """ + Analyzes query history to find which properties could get materialized. + + Returns an ordered list of suggestions by cost. 
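The cost ranking described in this docstring reduces to summing each slow query's cost into a `(table, property)` bucket and sorting descending. Below is a self-contained sketch of that aggregation with made-up observations standing in for what `Query.properties()` and `Query.cost` would yield; it is not the `Query` class from this module.

```python
from collections import defaultdict
from typing import Dict, List, Tuple

# (table, property, query_cost) triples standing in for real slow-query data.
observations: List[Tuple[str, str, int]] = [
    ("events", "$current_url", 4),
    ("person", "email", 7),
    ("events", "$current_url", 2),
]

costs: Dict[Tuple[str, str], int] = defaultdict(int)
for table, prop, cost in observations:
    costs[(table, prop)] += cost

# Highest aggregate cost first, matching the ordering analyze() returns.
suggestions = [
    (table, prop, total)
    for (table, prop), total in sorted(costs.items(), key=lambda kv: -kv[1])
]
# -> [("person", "email", 7), ("events", "$current_url", 6)]
```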
+ """ + + team_manager = TeamManager() + costs: defaultdict = defaultdict(int) + + for query in queries: + if not query.is_valid: + continue + + for table, property in query.properties(team_manager): + costs[(table, property)] += query.cost + + return [ + (table, property_name, cost) for (table, property_name), cost in sorted(costs.items(), key=lambda kv: -kv[1]) + ] + + +def materialize_properties_task( + columns_to_materialize: Optional[List[Suggestion]] = None, + time_to_analyze_hours: int = MATERIALIZE_COLUMNS_ANALYSIS_PERIOD_HOURS, + maximum: int = MATERIALIZE_COLUMNS_MAX_AT_ONCE, + min_query_time: int = MATERIALIZE_COLUMNS_MINIMUM_QUERY_TIME, + backfill_period_days: int = MATERIALIZE_COLUMNS_BACKFILL_PERIOD_DAYS, + dry_run: bool = False, +) -> None: + """ + Creates materialized columns for event and person properties based off of slow queries + """ + + if columns_to_materialize is None: + columns_to_materialize = analyze(get_queries(time_to_analyze_hours, min_query_time)) + result = [] + for suggestion in columns_to_materialize: + table, property_name, _ = suggestion + if property_name not in get_materialized_columns(table): + result.append(suggestion) + + if len(result) > 0: + logger.info(f"Calculated columns that could be materialized. count={len(result)}") + else: + logger.info("Found no columns to materialize.") + + properties: Dict[TableWithProperties, List[PropertyName]] = { + "events": [], + "person": [], + } + for table, property_name, cost in result[:maximum]: + logger.info(f"Materializing column. table={table}, property_name={property_name}, cost={cost}") + + if not dry_run: + materialize(table, property_name) + properties[table].append(property_name) + + if backfill_period_days > 0 and not dry_run: + logger.info(f"Starting backfill for new materialized columns. 
period_days={backfill_period_days}") + backfill_materialized_columns("events", properties["events"], timedelta(days=backfill_period_days)) + backfill_materialized_columns("person", properties["person"], timedelta(days=backfill_period_days)) diff --git a/ee/clickhouse/materialized_columns/columns.py b/ee/clickhouse/materialized_columns/columns.py new file mode 100644 index 0000000000000..4df2be02e8ce9 --- /dev/null +++ b/ee/clickhouse/materialized_columns/columns.py @@ -0,0 +1,154 @@ +import re +from datetime import timedelta +from typing import Dict, List + +from django.utils.timezone import now + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.materialized_columns.util import cache_for +from posthog.models.property import PropertyName, TableWithProperties +from posthog.models.utils import generate_random_short_suffix +from posthog.settings import ( + CLICKHOUSE_CLUSTER, + CLICKHOUSE_DATABASE, + CLICKHOUSE_REPLICATION, + MATERIALIZED_COLUMNS_ENABLED, + TEST, +) + +ColumnName = str + +TRIM_AND_EXTRACT_PROPERTY = "trim(BOTH '\"' FROM JSONExtractRaw(properties, %(property)s))" + + +@cache_for(timedelta(minutes=15)) +def get_materialized_columns(table: TableWithProperties) -> Dict[PropertyName, ColumnName]: + rows = sync_execute( + """ + SELECT comment, name + FROM system.columns + WHERE database = %(database)s + AND table = %(table)s + AND comment LIKE '%%column_materializer::%%' + """, + {"database": CLICKHOUSE_DATABASE, "table": table}, + ) + if rows and MATERIALIZED_COLUMNS_ENABLED: + return {extract_property(comment): column_name for comment, column_name in rows} + else: + return {} + + +def materialize(table: TableWithProperties, property: PropertyName, column_name=None) -> None: + if property in get_materialized_columns(table, use_cache=False): + if TEST: + return + + raise ValueError(f"Property already materialized. table={table}, property={property}") + + column_name = column_name or materialized_column_name(table, property) + # :TRICKY: On cloud, we ON CLUSTER updates to events/sharded_events but not to persons. Why? ¯\_(ツ)_/¯ + execute_on_cluster = f"ON CLUSTER {CLICKHOUSE_CLUSTER}" if table == "events" else "" + + if CLICKHOUSE_REPLICATION and table == "events": + sync_execute( + f""" + ALTER TABLE sharded_{table} + {execute_on_cluster} + ADD COLUMN IF NOT EXISTS + {column_name} VARCHAR MATERIALIZED {TRIM_AND_EXTRACT_PROPERTY} + """, + {"property": property}, + ) + sync_execute( + f""" + ALTER TABLE {table} + {execute_on_cluster} + ADD COLUMN IF NOT EXISTS + {column_name} VARCHAR + """ + ) + else: + sync_execute( + f""" + ALTER TABLE {table} + {execute_on_cluster} + ADD COLUMN IF NOT EXISTS + {column_name} VARCHAR MATERIALIZED {TRIM_AND_EXTRACT_PROPERTY} + """, + {"property": property}, + ) + + sync_execute( + f"ALTER TABLE {table} {execute_on_cluster} COMMENT COLUMN {column_name} %(comment)s", + {"comment": f"column_materializer::{property}"}, + ) + + +def backfill_materialized_columns( + table: TableWithProperties, properties: List[PropertyName], backfill_period: timedelta, test_settings=None +) -> None: + """ + Backfills the materialized column after its creation. + + This will require reading and writing a lot of data on clickhouse disk. + """ + + if len(properties) == 0: + return + + updated_table = "sharded_events" if CLICKHOUSE_REPLICATION and table == "events" else table + # :TRICKY: On cloud, we ON CLUSTER updates to events/sharded_events but not to persons. Why? 
¯\_(ツ)_/¯ + execute_on_cluster = f"ON CLUSTER {CLICKHOUSE_CLUSTER}" if table == "events" else "" + + materialized_columns = get_materialized_columns(table, use_cache=False) + + # Hack from https://github.com/ClickHouse/ClickHouse/issues/19785 + # Note that for this to work all inserts should list columns explicitly + # Improve this if https://github.com/ClickHouse/ClickHouse/issues/27730 ever gets resolved + for property in properties: + sync_execute( + f""" + ALTER TABLE {updated_table} + {execute_on_cluster} + MODIFY COLUMN + {materialized_columns[property]} VARCHAR DEFAULT {TRIM_AND_EXTRACT_PROPERTY} + """, + {"property": property}, + settings=test_settings, + ) + + # Kick off mutations which will update clickhouse partitions in the background. This will return immediately + assignments = ", ".join( + f"{materialized_columns[property]} = {materialized_columns[property]}" for property in properties + ) + + sync_execute( + f""" + ALTER TABLE {updated_table} + {execute_on_cluster} + UPDATE {assignments} + WHERE {"timestamp > %(cutoff)s" if table == "events" else "1 = 1"} + """, + {"cutoff": (now() - backfill_period).strftime("%Y-%m-%d")}, + settings=test_settings, + ) + + +def materialized_column_name(table: TableWithProperties, property: PropertyName) -> str: + "Returns a sanitized and unique column name to use for materialized column" + + prefix = "mat_" if table == "events" else "pmat_" + property_str = re.sub("[^0-9a-zA-Z$]", "_", property) + + existing_materialized_columns = set(get_materialized_columns(table, use_cache=False).values()) + suffix = "" + + while f"{prefix}{property_str}{suffix}" in existing_materialized_columns: + suffix = "_" + generate_random_short_suffix() + + return f"{prefix}{property_str}{suffix}" + + +def extract_property(comment: str) -> PropertyName: + return comment.split("::", 1)[1] diff --git a/ee/clickhouse/materialized_columns/test/__init__.py b/ee/clickhouse/materialized_columns/test/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/ee/clickhouse/materialized_columns/test/test_analyze.py b/ee/clickhouse/materialized_columns/test/test_analyze.py new file mode 100644 index 0000000000000..f9ad079684399 --- /dev/null +++ b/ee/clickhouse/materialized_columns/test/test_analyze.py @@ -0,0 +1,55 @@ +from ee.clickhouse.materialized_columns import materialize +from ee.clickhouse.materialized_columns.analyze import Query, TeamManager, analyze +from ee.clickhouse.util import ClickhouseTestMixin +from posthog.models import Person, PropertyDefinition +from posthog.test.base import BaseTest + + +class TestMaterializedColumnsAnalyze(ClickhouseTestMixin, BaseTest): + def setUp(self): + super().setUp() + self.DUMMY_QUERIES = [ + ( + f"SELECT JSONExtractString(properties, 'event_prop') FROM events WHERE team_id = {self.team.pk} AND trim(BOTH '\"' FROM JSONExtractRaw(properties, 'another_prop')", + 6723, + ), + (f"SELECT JSONExtractString(properties, 'person_prop') FROM person WHERE team_id = {self.team.pk}", 9723), + ] + + # Create property definitions + PropertyDefinition.objects.create(team=self.team, name="event_prop") + PropertyDefinition.objects.create(team=self.team, name="another_prop") + + Person.objects.create( + team_id=self.team.pk, + distinct_ids=["2"], + properties={"person_prop": "something", "$another_prop": "something"}, + ) + + def test_query_class(self): + with self.settings(MATERIALIZE_COLUMNS_MINIMUM_QUERY_TIME=3000): + event_query = Query(*self.DUMMY_QUERIES[0]) + person_query = Query(*self.DUMMY_QUERIES[1]) + + 
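As background for the analyze() step these tests exercise: candidate columns are ranked by summing slow-query costs per (table, property) pair and sorting in descending order of total cost. A minimal, self-contained sketch of that ranking, not part of this changeset and using made-up numbers:

from collections import defaultdict
from typing import DefaultDict, List, Tuple

Suggestion = Tuple[str, str, int]  # (table, property_name, total cost), as returned by analyze()

def rank_suggestions(observed: List[Tuple[str, str, int]]) -> List[Suggestion]:
    # Sum the cost of every slow query that touched a given (table, property) pair...
    costs: DefaultDict[Tuple[str, str], int] = defaultdict(int)
    for table, property_name, cost in observed:
        costs[(table, property_name)] += cost
    # ...and return the pairs ordered from most to least expensive.
    return [
        (table, property_name, cost)
        for (table, property_name), cost in sorted(costs.items(), key=lambda kv: -kv[1])
    ]

assert rank_suggestions(
    [("events", "event_prop", 4), ("person", "person_prop", 7), ("events", "event_prop", 2)]
) == [("person", "person_prop", 7), ("events", "event_prop", 6)]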
self.assertTrue(event_query.is_valid) + self.assertTrue(person_query.is_valid) + + self.assertEqual(event_query.team_id, str(self.team.pk)) + self.assertEqual(person_query.team_id, str(self.team.pk)) + + self.assertEqual( + list(event_query.properties(TeamManager())), [("events", "event_prop"), ("events", "another_prop")] + ) + self.assertEqual(list(person_query.properties(TeamManager())), [("person", "person_prop")]) + self.assertEqual(event_query.cost, 4) + self.assertEqual(person_query.cost, 7) + + def test_query_class_edge_cases(self): + invalid_query = Query("SELECT * FROM events WHERE team_id = -1", 100) + self.assertFalse(invalid_query.is_valid) + self.assertIsNone(invalid_query.team_id) + + query_with_unknown_property = Query( + f"SELECT JSONExtractString(properties, '$unknown_prop') FROM events WHERE team_id = {self.team.pk}", 0 + ) + self.assertEqual(list(query_with_unknown_property.properties(TeamManager())), []) diff --git a/ee/clickhouse/materialized_columns/test/test_columns.py b/ee/clickhouse/materialized_columns/test/test_columns.py new file mode 100644 index 0000000000000..f1bc3c93e5d84 --- /dev/null +++ b/ee/clickhouse/materialized_columns/test/test_columns.py @@ -0,0 +1,184 @@ +import random +from datetime import timedelta +from time import sleep +from uuid import uuid4 + +from freezegun import freeze_time + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.materialized_columns.columns import ( + backfill_materialized_columns, + get_materialized_columns, + materialize, +) +from ee.clickhouse.models.event import create_event +from ee.clickhouse.util import ClickhouseDestroyTablesMixin, ClickhouseTestMixin +from ee.tasks.materialized_columns import mark_all_materialized +from posthog.constants import GROUP_TYPES_LIMIT +from posthog.settings import CLICKHOUSE_DATABASE +from posthog.test.base import BaseTest + +GROUPS_COLUMNS = [f"$group_{i}" for i in range(GROUP_TYPES_LIMIT)] + + +def _create_event(**kwargs): + pk = uuid4() + kwargs.update({"event_uuid": pk}) + create_event(**kwargs) + return pk + + +class TestMaterializedColumns(ClickhouseTestMixin, ClickhouseDestroyTablesMixin, BaseTest): + def test_get_columns_default(self): + self.assertCountEqual(get_materialized_columns("events"), GROUPS_COLUMNS) + self.assertCountEqual(get_materialized_columns("person"), []) + + def test_caching_and_materializing(self): + with freeze_time("2020-01-04T13:01:01Z"): + materialize("events", "$foo") + materialize("events", "$bar") + materialize("person", "$zeta") + + self.assertCountEqual( + get_materialized_columns("events", use_cache=True).keys(), ["$foo", "$bar", *GROUPS_COLUMNS] + ) + self.assertCountEqual(get_materialized_columns("person", use_cache=True).keys(), ["$zeta"]) + + materialize("events", "abc") + + self.assertCountEqual( + get_materialized_columns("events", use_cache=True).keys(), ["$foo", "$bar", *GROUPS_COLUMNS] + ) + + with freeze_time("2020-01-04T14:00:01Z"): + self.assertCountEqual( + get_materialized_columns("events", use_cache=True).keys(), ["$foo", "$bar", "abc", *GROUPS_COLUMNS] + ) + + def test_materialized_column_naming(self): + random.seed(0) + + materialize("events", "$foO();--sqlinject") + materialize("events", "$foO();ääsqlinject") + materialize("events", "$foO_____sqlinject") + materialize("person", "SoMePrOp") + + self.assertDictContainsSubset( + { + "$foO();--sqlinject": "mat_$foO_____sqlinject", + "$foO();ääsqlinject": "mat_$foO_____sqlinject_yWAc", + "$foO_____sqlinject": "mat_$foO_____sqlinject_qGFz", + }, + 
get_materialized_columns("events"), + ) + + self.assertEqual(get_materialized_columns("person"), {"SoMePrOp": "pmat_SoMePrOp"}) + + def test_backfilling_data(self): + sync_execute("ALTER TABLE events DROP COLUMN IF EXISTS mat_prop") + sync_execute("ALTER TABLE events DROP COLUMN IF EXISTS mat_another") + + _create_event( + event="some_event", distinct_id="1", team=self.team, timestamp="2020-01-01 00:00:00", properties={"prop": 1} + ) + _create_event( + event="some_event", + distinct_id="1", + team=self.team, + timestamp="2021-05-02 00:00:00", + properties={"prop": 2, "another": 5}, + ) + _create_event( + event="some_event", distinct_id="1", team=self.team, timestamp="2021-05-03 00:00:00", properties={"prop": 3} + ) + _create_event(event="another_event", distinct_id="1", team=self.team, timestamp="2021-05-04 00:00:00") + _create_event( + event="third_event", + distinct_id="1", + team=self.team, + timestamp="2021-05-05 00:00:00", + properties={"prop": 4}, + ) + _create_event( + event="fourth_event", + distinct_id="1", + team=self.team, + timestamp="2021-05-06 00:00:00", + properties={"another": 6}, + ) + + materialize("events", "prop") + materialize("events", "another") + + self.assertEqual(self._count_materialized_rows("mat_prop"), 0) + self.assertEqual(self._count_materialized_rows("mat_another"), 0) + + with freeze_time("2021-05-10T14:00:01Z"): + backfill_materialized_columns( + "events", ["prop", "another"], timedelta(days=50), test_settings={"mutations_sync": "0"} + ) + + _create_event( + event="fifth_event", + distinct_id="1", + team=self.team, + timestamp="2021-05-07 00:00:00", + properties={"another": 7}, + ) + + iterations = 0 + while self._get_count_of_mutations_running() > 0 and iterations < 100: + sleep(0.1) + iterations += 1 + + self.assertGreaterEqual(self._count_materialized_rows("mat_prop"), 4) + self.assertGreaterEqual(self._count_materialized_rows("mat_another"), 4) + + self.assertEqual( + sync_execute("SELECT mat_prop, mat_another FROM events ORDER BY timestamp"), + [("1", ""), ("2", "5"), ("3", ""), ("", ""), ("4", ""), ("", "6"), ("", "7")], + ) + + def test_column_types(self): + materialize("events", "myprop") + + # :KLUDGE: ClickHouse replaces our trim(BOTH '"' FROM properties) with this + expr = "replaceRegexpAll(JSONExtractRaw(properties, 'myprop'), concat('^[', regexpQuoteMeta('\"'), ']*|[', regexpQuoteMeta('\"'), ']*$'), '')" + self.assertEqual(("MATERIALIZED", expr), self._get_column_types("events", "mat_myprop")) + + backfill_materialized_columns("events", ["myprop"], timedelta(days=50)) + self.assertEqual(("DEFAULT", expr), self._get_column_types("events", "mat_myprop")) + + mark_all_materialized() + self.assertEqual(("MATERIALIZED", expr), self._get_column_types("events", "mat_myprop")) + + def _count_materialized_rows(self, column): + return sync_execute( + """ + SELECT sum(rows) + FROM system.parts_columns + WHERE table = 'events' + AND database = %(database)s + AND column = %(column)s + """, + {"database": CLICKHOUSE_DATABASE, "column": column}, + )[0][0] + + def _get_count_of_mutations_running(self) -> int: + return sync_execute( + """ + SELECT count(*) + FROM system.mutations + WHERE is_done = 0 + """ + )[0][0] + + def _get_column_types(self, table: str, column: str): + return sync_execute( + """ + SELECT default_kind, default_expression + FROM system.columns + WHERE database = %(database)s AND table = %(table)s AND name = %(column)s + """, + {"table": table, "database": CLICKHOUSE_DATABASE, "column": column}, + )[0] diff --git 
a/ee/clickhouse/materialized_columns/util.py b/ee/clickhouse/materialized_columns/util.py new file mode 100644 index 0000000000000..cd9d6836fa5b3 --- /dev/null +++ b/ee/clickhouse/materialized_columns/util.py @@ -0,0 +1,41 @@ +from datetime import timedelta +from functools import wraps +from typing import no_type_check + +from django.utils.timezone import now + +from posthog.settings import TEST + + +def cache_for(cache_time: timedelta): + def wrapper(fn): + @wraps(fn) + @no_type_check + def memoized_fn(*args, use_cache=not TEST): + if not use_cache: + return fn(*args) + + current_time = now() + if args not in memoized_fn._cache or current_time - memoized_fn._cache[args][0] > cache_time: + memoized_fn._cache[args] = (current_time, fn(*args)) + return memoized_fn._cache[args][1] + + memoized_fn._cache = {} + return memoized_fn + + return wrapper + + +def instance_memoize(callback): + name = f"_{callback.__name__}_memo" + + def _inner(self, *args): + if not hasattr(self, name): + setattr(self, name, {}) + + memo = getattr(self, name) + if args not in memo: + memo[args] = callback(self, *args) + return memo[args] + + return _inner diff --git a/ee/clickhouse/middleware.py b/ee/clickhouse/middleware.py new file mode 100644 index 0000000000000..a200c6304b564 --- /dev/null +++ b/ee/clickhouse/middleware.py @@ -0,0 +1,43 @@ +from django.conf import settings +from django.http import HttpRequest, HttpResponse +from django.urls.base import resolve +from loginas.utils import is_impersonated_session + +from posthog.internal_metrics import incr +from posthog.utils import is_clickhouse_enabled + + +class CHQueries(object): + def __init__(self, get_response): + self.get_response = get_response + + def __call__(self, request: HttpRequest): + """ Install monkey-patch on demand. + + If monkey-patch has not been run in for this process (assuming multiple preforked processes), + then do it now. 
+ + """ + from ee.clickhouse import client + + route = resolve(request.path) + route_id = f"{route.route} ({route.func.__name__})" + client._request_information = { + "save": ( + is_clickhouse_enabled() + and request.user.pk + and (request.user.is_staff or is_impersonated_session(request) or settings.DEBUG) + ), + "user_id": request.user.pk, + "kind": "request", + "id": route_id, + } + + response: HttpResponse = self.get_response(request) + + if "api/" in route_id and "capture" not in route_id: + incr("http_api_request_response", tags={"id": route_id, "status_code": response.status_code}) + + client._request_information = None + + return response diff --git a/ee/clickhouse/migrations/0002_events_materialized.py b/ee/clickhouse/migrations/0002_events_materialized.py index 8371044975947..9df93877941e1 100644 --- a/ee/clickhouse/migrations/0002_events_materialized.py +++ b/ee/clickhouse/migrations/0002_events_materialized.py @@ -1,13 +1,3 @@ from infi.clickhouse_orm import migrations -from ee.clickhouse.sql.events import ( - EVENTS_WITH_PROPS_TABLE_SQL, - MAT_EVENT_PROP_TABLE_SQL, - MAT_EVENTS_WITH_PROPS_TABLE_SQL, -) - -operations = [ - migrations.RunSQL(EVENTS_WITH_PROPS_TABLE_SQL), - migrations.RunSQL(MAT_EVENTS_WITH_PROPS_TABLE_SQL), - migrations.RunSQL(MAT_EVENT_PROP_TABLE_SQL), -] +operations = [migrations.RunSQL("SELECT 1")] diff --git a/ee/clickhouse/migrations/0007_static_cohorts_table.py b/ee/clickhouse/migrations/0007_static_cohorts_table.py new file mode 100644 index 0000000000000..a377866bd55f3 --- /dev/null +++ b/ee/clickhouse/migrations/0007_static_cohorts_table.py @@ -0,0 +1,7 @@ +from infi.clickhouse_orm import migrations + +from ee.clickhouse.sql.person import PERSON_STATIC_COHORT_TABLE_SQL + +operations = [ + migrations.RunSQL(PERSON_STATIC_COHORT_TABLE_SQL), +] diff --git a/ee/clickhouse/migrations/0008_plugin_log_entries.py b/ee/clickhouse/migrations/0008_plugin_log_entries.py new file mode 100644 index 0000000000000..0fb4b39cf61d1 --- /dev/null +++ b/ee/clickhouse/migrations/0008_plugin_log_entries.py @@ -0,0 +1,13 @@ +from infi.clickhouse_orm import migrations + +from ee.clickhouse.sql.plugin_log_entries import ( + KAFKA_PLUGIN_LOG_ENTRIES_TABLE_SQL, + PLUGIN_LOG_ENTRIES_TABLE_MV_SQL, + PLUGIN_LOG_ENTRIES_TABLE_SQL, +) + +operations = [ + migrations.RunSQL(PLUGIN_LOG_ENTRIES_TABLE_SQL), + migrations.RunSQL(KAFKA_PLUGIN_LOG_ENTRIES_TABLE_SQL), + migrations.RunSQL(PLUGIN_LOG_ENTRIES_TABLE_MV_SQL), +] diff --git a/ee/clickhouse/migrations/0009_person_deleted_column.py b/ee/clickhouse/migrations/0009_person_deleted_column.py new file mode 100644 index 0000000000000..5b576e65ddf87 --- /dev/null +++ b/ee/clickhouse/migrations/0009_person_deleted_column.py @@ -0,0 +1,14 @@ +from infi.clickhouse_orm import migrations + +from ee.clickhouse.sql.person import KAFKA_PERSONS_TABLE_SQL, PERSONS_TABLE_MV_SQL +from posthog.settings import CLICKHOUSE_CLUSTER + +operations = [ + migrations.RunSQL(f"DROP TABLE person_mv ON CLUSTER {CLICKHOUSE_CLUSTER}"), + migrations.RunSQL(f"DROP TABLE kafka_person ON CLUSTER {CLICKHOUSE_CLUSTER}"), + migrations.RunSQL( + f"ALTER TABLE person ON CLUSTER {CLICKHOUSE_CLUSTER} ADD COLUMN IF NOT EXISTS is_deleted Boolean DEFAULT 0" + ), + migrations.RunSQL(KAFKA_PERSONS_TABLE_SQL), + migrations.RunSQL(PERSONS_TABLE_MV_SQL), +] diff --git a/ee/clickhouse/migrations/0010_cohortpeople.py b/ee/clickhouse/migrations/0010_cohortpeople.py new file mode 100644 index 0000000000000..0cda57f05afa7 --- /dev/null +++ b/ee/clickhouse/migrations/0010_cohortpeople.py @@ 
-0,0 +1,5 @@ +from infi.clickhouse_orm import migrations + +from ee.clickhouse.sql.cohort import CREATE_COHORTPEOPLE_TABLE_SQL + +operations = [migrations.RunSQL(CREATE_COHORTPEOPLE_TABLE_SQL)] diff --git a/ee/clickhouse/migrations/0011_cohortpeople_no_shard.py b/ee/clickhouse/migrations/0011_cohortpeople_no_shard.py new file mode 100644 index 0000000000000..6f34037d65f6c --- /dev/null +++ b/ee/clickhouse/migrations/0011_cohortpeople_no_shard.py @@ -0,0 +1,7 @@ +from infi.clickhouse_orm import migrations + +from ee.clickhouse.sql.cohort import CREATE_COHORTPEOPLE_TABLE_SQL, DROP_COHORTPEOPLE_TABLE_SQL +from posthog.settings import CLICKHOUSE_REPLICATION + +# run create table again with proper configuration +operations = [migrations.RunSQL(DROP_COHORTPEOPLE_TABLE_SQL), migrations.RunSQL(CREATE_COHORTPEOPLE_TABLE_SQL)] diff --git a/ee/clickhouse/migrations/0012_person_id_deleted_column.py b/ee/clickhouse/migrations/0012_person_id_deleted_column.py new file mode 100644 index 0000000000000..c8d94bda887f3 --- /dev/null +++ b/ee/clickhouse/migrations/0012_person_id_deleted_column.py @@ -0,0 +1,14 @@ +from infi.clickhouse_orm import migrations + +from ee.clickhouse.sql.person import KAFKA_PERSONS_DISTINCT_ID_TABLE_SQL, PERSONS_DISTINCT_ID_TABLE_MV_SQL +from posthog.settings import CLICKHOUSE_CLUSTER + +operations = [ + migrations.RunSQL(f"DROP TABLE person_distinct_id_mv ON CLUSTER {CLICKHOUSE_CLUSTER}"), + migrations.RunSQL(f"DROP TABLE kafka_person_distinct_id ON CLUSTER {CLICKHOUSE_CLUSTER}"), + migrations.RunSQL( + f"ALTER TABLE person_distinct_id ON CLUSTER {CLICKHOUSE_CLUSTER} ADD COLUMN IF NOT EXISTS is_deleted Boolean DEFAULT 0" + ), + migrations.RunSQL(KAFKA_PERSONS_DISTINCT_ID_TABLE_SQL), + migrations.RunSQL(PERSONS_DISTINCT_ID_TABLE_MV_SQL), +] diff --git a/ee/clickhouse/migrations/0013_persons_distinct_ids_column.py b/ee/clickhouse/migrations/0013_persons_distinct_ids_column.py new file mode 100644 index 0000000000000..1f5fe99b2ac01 --- /dev/null +++ b/ee/clickhouse/migrations/0013_persons_distinct_ids_column.py @@ -0,0 +1,14 @@ +from infi.clickhouse_orm import migrations + +from ee.clickhouse.sql.person import KAFKA_PERSONS_TABLE_SQL, PERSONS_TABLE_MV_SQL +from posthog.settings import CLICKHOUSE_CLUSTER + +operations = [ + migrations.RunSQL(f"DROP TABLE person_mv ON CLUSTER {CLICKHOUSE_CLUSTER}"), + migrations.RunSQL(f"DROP TABLE kafka_person ON CLUSTER {CLICKHOUSE_CLUSTER}"), + migrations.RunSQL( + f"ALTER TABLE person ON CLUSTER {CLICKHOUSE_CLUSTER} ADD COLUMN IF NOT EXISTS distinct_ids Array(VARCHAR)" + ), + migrations.RunSQL(KAFKA_PERSONS_TABLE_SQL), + migrations.RunSQL(PERSONS_TABLE_MV_SQL), +] diff --git a/ee/clickhouse/migrations/0014_persons_distinct_ids_column_remove.py b/ee/clickhouse/migrations/0014_persons_distinct_ids_column_remove.py new file mode 100644 index 0000000000000..4f302128f5274 --- /dev/null +++ b/ee/clickhouse/migrations/0014_persons_distinct_ids_column_remove.py @@ -0,0 +1,12 @@ +from infi.clickhouse_orm import migrations + +from ee.clickhouse.sql.person import KAFKA_PERSONS_TABLE_SQL, PERSONS_TABLE_MV_SQL +from posthog.settings import CLICKHOUSE_CLUSTER + +operations = [ + migrations.RunSQL(f"DROP TABLE person_mv ON CLUSTER {CLICKHOUSE_CLUSTER}"), + migrations.RunSQL(f"DROP TABLE kafka_person ON CLUSTER {CLICKHOUSE_CLUSTER}"), + migrations.RunSQL(f"ALTER TABLE person ON CLUSTER {CLICKHOUSE_CLUSTER} DROP COLUMN IF EXISTS distinct_ids"), + migrations.RunSQL(KAFKA_PERSONS_TABLE_SQL), + migrations.RunSQL(PERSONS_TABLE_MV_SQL), +] diff --git 
a/ee/clickhouse/migrations/0015_materialized_column_comments.py b/ee/clickhouse/migrations/0015_materialized_column_comments.py new file mode 100644 index 0000000000000..4636485e731aa --- /dev/null +++ b/ee/clickhouse/migrations/0015_materialized_column_comments.py @@ -0,0 +1,15 @@ +from infi.clickhouse_orm import migrations + +from posthog.settings import CLICKHOUSE_CLUSTER + +already_materialized_columns = [ + ("properties_issampledevent", "isSampledEvent"), + ("properties_currentscreen", "currentScreen"), + ("properties_objectname", "objectName"), +] + +operations = [] + +for column_name, property in already_materialized_columns: + statement = f"ALTER TABLE events ON CLUSTER {CLICKHOUSE_CLUSTER} COMMENT COLUMN IF EXISTS {column_name} 'column_materializer::{property}'" + operations.append(migrations.RunSQL(statement)) diff --git a/ee/clickhouse/migrations/0016_collapsing_person_distinct_id.py b/ee/clickhouse/migrations/0016_collapsing_person_distinct_id.py new file mode 100644 index 0000000000000..d9fcc6e0f50f9 --- /dev/null +++ b/ee/clickhouse/migrations/0016_collapsing_person_distinct_id.py @@ -0,0 +1,35 @@ +from infi.clickhouse_orm import migrations + +from ee.clickhouse.sql.person import * +from posthog.settings import CLICKHOUSE_CLUSTER + +TEMPORARY_TABLE_NAME = "person_distinct_id_tmp_migration_0016" + +operations = [ + migrations.RunSQL(PERSONS_DISTINCT_ID_TABLE_SQL.replace(PERSONS_DISTINCT_ID_TABLE, TEMPORARY_TABLE_NAME, 1)), + migrations.RunSQL(f"DROP TABLE person_distinct_id_mv ON CLUSTER {CLICKHOUSE_CLUSTER}"), + migrations.RunSQL(f"DROP TABLE kafka_person_distinct_id ON CLUSTER {CLICKHOUSE_CLUSTER}"), + migrations.RunSQL( + f""" + INSERT INTO {TEMPORARY_TABLE_NAME} (distinct_id, person_id, team_id, _sign, _timestamp, _offset) + SELECT + distinct_id, + person_id, + team_id, + if(is_deleted==0, 1, -1) as _sign, + _timestamp, + _offset + FROM {PERSONS_DISTINCT_ID_TABLE} + """ + ), + migrations.RunSQL( + f""" + RENAME TABLE + {CLICKHOUSE_DATABASE}.{PERSONS_DISTINCT_ID_TABLE} to {CLICKHOUSE_DATABASE}.person_distinct_id_backup, + {CLICKHOUSE_DATABASE}.{TEMPORARY_TABLE_NAME} to {CLICKHOUSE_DATABASE}.{PERSONS_DISTINCT_ID_TABLE} + ON CLUSTER {CLICKHOUSE_CLUSTER} + """ + ), + migrations.RunSQL(KAFKA_PERSONS_DISTINCT_ID_TABLE_SQL), + migrations.RunSQL(PERSONS_DISTINCT_ID_TABLE_MV_SQL), +] diff --git a/ee/clickhouse/migrations/0017_events_dead_letter_queue.py b/ee/clickhouse/migrations/0017_events_dead_letter_queue.py new file mode 100644 index 0000000000000..1dcf048f2ea6f --- /dev/null +++ b/ee/clickhouse/migrations/0017_events_dead_letter_queue.py @@ -0,0 +1,13 @@ +from infi.clickhouse_orm import migrations + +from ee.clickhouse.sql.dead_letter_queue import ( + DEAD_LETTER_QUEUE_TABLE_MV_SQL, + DEAD_LETTER_QUEUE_TABLE_SQL, + KAFKA_DEAD_LETTER_QUEUE_TABLE_SQL, +) + +operations = [ + migrations.RunSQL(DEAD_LETTER_QUEUE_TABLE_SQL), + migrations.RunSQL(KAFKA_DEAD_LETTER_QUEUE_TABLE_SQL), + migrations.RunSQL(DEAD_LETTER_QUEUE_TABLE_MV_SQL), +] diff --git a/ee/clickhouse/migrations/0018_group_analytics_schema.py b/ee/clickhouse/migrations/0018_group_analytics_schema.py new file mode 100644 index 0000000000000..62c0aedb3c13d --- /dev/null +++ b/ee/clickhouse/migrations/0018_group_analytics_schema.py @@ -0,0 +1,9 @@ +from infi.clickhouse_orm import migrations + +from ee.clickhouse.sql.groups import GROUPS_TABLE_MV_SQL, GROUPS_TABLE_SQL, KAFKA_GROUPS_TABLE_SQL + +operations = [ + migrations.RunSQL(GROUPS_TABLE_SQL), + migrations.RunSQL(KAFKA_GROUPS_TABLE_SQL), + 
migrations.RunSQL(GROUPS_TABLE_MV_SQL), +] diff --git a/ee/clickhouse/migrations/0019_group_analytics_materialized_columns.py b/ee/clickhouse/migrations/0019_group_analytics_materialized_columns.py new file mode 100644 index 0000000000000..f50a583321c3c --- /dev/null +++ b/ee/clickhouse/migrations/0019_group_analytics_materialized_columns.py @@ -0,0 +1,18 @@ +from infi.clickhouse_orm import migrations + +from ee.clickhouse.materialized_columns.columns import materialize + + +def create_materialized_columns(database): + try: + materialize("events", "$group_0", "$group_0") + materialize("events", "$group_1", "$group_1") + materialize("events", "$group_2", "$group_2") + materialize("events", "$group_3", "$group_3") + materialize("events", "$group_4", "$group_4") + except ValueError: + # Group is already materialized, skip + pass + + +operations = [migrations.RunPython(create_materialized_columns)] diff --git a/ee/clickhouse/migrations/0020_session_recording_events_window_id.py b/ee/clickhouse/migrations/0020_session_recording_events_window_id.py new file mode 100644 index 0000000000000..5bf6fa9400b99 --- /dev/null +++ b/ee/clickhouse/migrations/0020_session_recording_events_window_id.py @@ -0,0 +1,17 @@ +from infi.clickhouse_orm import migrations + +from ee.clickhouse.sql.session_recording_events import ( + KAFKA_SESSION_RECORDING_EVENTS_TABLE_SQL, + SESSION_RECORDING_EVENTS_TABLE_MV_SQL, +) +from posthog.settings import CLICKHOUSE_CLUSTER + +operations = [ + migrations.RunSQL(f"DROP TABLE session_recording_events_mv ON CLUSTER {CLICKHOUSE_CLUSTER}"), + migrations.RunSQL(f"DROP TABLE kafka_session_recording_events ON CLUSTER {CLICKHOUSE_CLUSTER}"), + migrations.RunSQL( + f"ALTER TABLE session_recording_events ON CLUSTER {CLICKHOUSE_CLUSTER} ADD COLUMN IF NOT EXISTS window_id VARCHAR AFTER session_id" + ), + migrations.RunSQL(KAFKA_SESSION_RECORDING_EVENTS_TABLE_SQL), + migrations.RunSQL(SESSION_RECORDING_EVENTS_TABLE_MV_SQL), +] diff --git a/ee/clickhouse/models/action.py b/ee/clickhouse/models/action.py index 4c679b4aeaf7f..58809ab2e28e9 100644 --- a/ee/clickhouse/models/action.py +++ b/ee/clickhouse/models/action.py @@ -1,17 +1,24 @@ -import re -from typing import Dict, List, Tuple +from typing import Counter, Dict, List, Tuple from django.forms.models import model_to_dict -from posthog.constants import AUTOCAPTURE_EVENT -from posthog.models import Action, Filter +from ee.clickhouse.models.util import PersonPropertiesMode +from posthog.constants import AUTOCAPTURE_EVENT, TREND_FILTER_TYPE_ACTIONS +from posthog.models import Action, Entity, Filter from posthog.models.action_step import ActionStep -from posthog.models.event import Selector +from posthog.models.property import Property, PropertyIdentifier, PropertyName, PropertyType -def format_action_filter(action: Action, prepend: str = "", index=0, use_loop: bool = False) -> Tuple[str, Dict]: +def format_action_filter( + action: Action, + prepend: str = "action", + use_loop: bool = False, + filter_by_team=True, + table_name: str = "", + person_properties_mode: PersonPropertiesMode = PersonPropertiesMode.USING_SUBQUERY, +) -> Tuple[str, Dict]: # get action steps - params = {"team_id": action.team.pk} + params = {"team_id": action.team.pk} if filter_by_team else {} steps = action.steps.all() if len(steps) == 0: # If no steps, it shouldn't match this part of the query @@ -22,12 +29,15 @@ def format_action_filter(action: Action, prepend: str = "", index=0, use_loop: b conditions: List[str] = [] # filter element if step.event == 
AUTOCAPTURE_EVENT: - el_conditions, element_params = filter_element(step, "{}{}".format(index, prepend)) + from ee.clickhouse.models.property import filter_element # prevent circular import + + el_condition, element_params = filter_element(model_to_dict(step), prepend=f"{action.pk}_{index}{prepend}") params = {**params, **element_params} - conditions += el_conditions + if len(el_condition) > 0: + conditions.append(el_condition) # filter event conditions (ie URL) - event_conditions, event_params = filter_event(step, "{}{}".format(index, prepend), index) + event_conditions, event_params = filter_event(step, f"{action.pk}_{index}{prepend}", index, table_name) params = {**params, **event_params} conditions += event_conditions @@ -35,7 +45,11 @@ def format_action_filter(action: Action, prepend: str = "", index=0, use_loop: b from ee.clickhouse.models.property import parse_prop_clauses prop_query, prop_params = parse_prop_clauses( - Filter(data={"properties": step.properties}).properties, action.team.pk + Filter(data={"properties": step.properties}).properties, + team_id=action.team.pk if filter_by_team else None, + prepend=f"action_props_{action.pk}_{step.pk}", + table_name=table_name, + person_properties_mode=person_properties_mode, ) conditions.append(prop_query.replace("AND", "", 1)) params = {**params, **prop_params} @@ -51,72 +65,64 @@ def format_action_filter(action: Action, prepend: str = "", index=0, use_loop: b return formatted_query, params -def filter_event(step: ActionStep, prepend: str = "", index: int = 0) -> Tuple[List[str], Dict]: - params = {} +def filter_event( + step: ActionStep, prepend: str = "event", index: int = 0, table_name: str = "" +) -> Tuple[List[str], Dict]: + from ee.clickhouse.models.property import get_property_string_expr + + params = {"{}_{}".format(prepend, index): step.event} conditions = [] + if table_name != "": + table_name += "." 
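One detail worth spelling out in format_action_filter() above is the placeholder namespacing: each step's query parameters are prefixed with the action pk and step index (prepend=f"{action.pk}_{index}{prepend}"), so several steps and actions can be merged into a single params dict without keys overwriting each other. A rough sketch of that pattern, not part of this changeset (the placeholder names and the OR composition are illustrative only):

from typing import Dict, List, Tuple

def step_condition(event: str, prepend: str, index: int) -> Tuple[str, Dict[str, str]]:
    # Namespace the placeholder so it stays unique across steps and actions.
    key = f"{prepend}_{index}"
    return f"event = %({key})s", {key: event}

action_pk = 42  # hypothetical action
params: Dict[str, str] = {}
conditions: List[str] = []
for index, event in enumerate(["$pageview", "$autocapture"]):
    condition, step_params = step_condition(event, prepend=f"{action_pk}_{index}action", index=index)
    conditions.append(condition)
    params.update(step_params)  # no collisions: each key carries the action pk and step index

print(" OR ".join(f"({c})" for c in conditions))  # steps act as alternative definitions of the action
print(params)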
+ if step.url: + value_expr, _ = get_property_string_expr("events", "$current_url", "'$current_url'", f"{table_name}properties") + prop_name = f"{prepend}_prop_val_{index}" if step.url_matching == ActionStep.EXACT: - conditions.append( - "JSONExtractString(properties, '$current_url') = %({}_prop_val_{})s".format(prepend, index) - ) - params.update({"{}_prop_val_{}".format(prepend, index): step.url}) + conditions.append(f"{value_expr} = %({prop_name})s") + params.update({prop_name: step.url}) elif step.url_matching == ActionStep.REGEX: - conditions.append( - "match(JSONExtractString(properties, '$current_url'), %({}_prop_val_{})s)".format(prepend, index) - ) - params.update({"{}_prop_val_{}".format(prepend, index): step.url}) + conditions.append(f"match({value_expr}, %({prop_name})s)") + params.update({prop_name: step.url}) else: - conditions.append( - "JSONExtractString(properties, '$current_url') LIKE %({}_prop_val_{})s".format(prepend, index) - ) - params.update({"{}_prop_val_{}".format(prepend, index): "%" + step.url + "%"}) + conditions.append(f"{value_expr} LIKE %({prop_name})s") + params.update({prop_name: f"%{step.url}%"}) - conditions.append("event = '{}'".format(step.event)) + conditions.append(f"event = %({prepend}_{index})s") return conditions, params -def _create_regex(selector: Selector) -> str: - regex = r"" - for idx, tag in enumerate(selector.parts): - if tag.data.get("tag_name") and isinstance(tag.data["tag_name"], str): - regex += tag.data["tag_name"] - if tag.data.get("attr_class__contains"): - regex += r".*?\.{}".format(r"\..*?".join(sorted(tag.data["attr_class__contains"]))) - if tag.ch_attributes: - regex += ".*?" - for key, value in sorted(tag.ch_attributes.items()): - regex += '{}="{}".*?'.format(key, value) - regex += r"([-_a-zA-Z0-9\.]*?)?($|;|:([^;^\s]*(;|$|\s)))" - if tag.direct_descendant: - regex += ".*" - return regex - - -def filter_element(step: ActionStep, prepend: str = "") -> Tuple[List[str], Dict]: - filters = model_to_dict(step) - params = {} - conditions = [] +def format_entity_filter(entity: Entity, prepend: str = "action", filter_by_team=True) -> Tuple[str, Dict]: + if entity.type == TREND_FILTER_TYPE_ACTIONS: + action = entity.get_action() + entity_filter, params = format_action_filter(action, prepend=prepend, filter_by_team=filter_by_team) + else: + key = f"{prepend}_event" + entity_filter = f"event = %({key})s" + params = {key: entity.id} - if filters.get("selector"): - selector = Selector(filters["selector"], escape_slashes=False) - params["{}selector_regex".format(prepend)] = _create_regex(selector) - conditions.append("match(elements_chain, %({}selector_regex)s)".format(prepend)) + return entity_filter, params - if filters.get("tag_name"): - params["{}tag_name_regex".format(prepend)] = r"(^|;){}(\.|$|;|:)".format(filters["tag_name"]) - conditions.append("match(elements_chain, %({}tag_name_regex)s)".format(prepend)) - attributes: Dict[str, str] = {} - for key in ["href", "text"]: - if filters.get(key): - attributes[key] = re.escape(filters[key]) +def get_action_tables_and_properties(action: Action) -> Counter[PropertyIdentifier]: + from ee.clickhouse.models.property import extract_tables_and_properties + + result: Counter[PropertyIdentifier] = Counter() + + for action_step in action.steps.all(): + if action_step.url: + result[("$current_url", "event", None)] += 1 + result += extract_tables_and_properties(Filter(data={"properties": action_step.properties or []}).properties) + + return result - if len(attributes.keys()) > 0: - 
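The element-matching logic being reshuffled here (and relocated into ee/clickhouse/models/property.filter_element, per the import above) boils down to building a regular expression from the step's href/text filters and letting ClickHouse run it against the serialized elements_chain with match(). A small sketch of the pattern shape, not part of this changeset; the elements_chain value below is made up and the real matching happens server-side:

import re

# Mirror of the construction in filter_element: href/text values are regex-escaped,
# joined with non-greedy gaps, and wrapped so the pattern can sit anywhere in the chain.
attributes = {"href": re.escape("/pricing"), "text": re.escape("Sign up")}
attributes_regex = ".*?({}).*?".format(".*?".join(f'{k}="{v}"' for k, v in attributes.items()))

# Hypothetical elements_chain value for a captured <a> element.
elements_chain = 'a.button:href="/pricing"text="Sign up"nth-child="1";div.nav:nth-child="2"'
assert re.search(attributes_regex, elements_chain) is not None
print(attributes_regex)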
params["{}attributes_regex".format(prepend)] = ".*?({}).*?".format( - ".*?".join(['{}="{}"'.format(key, value) for key, value in attributes.items()]) - ) - conditions.append("match(elements_chain, %({}attributes_regex)s)".format(prepend)) - return (conditions, params) +def uses_elements_chain(action: Action) -> bool: + for action_step in action.steps.all(): + if any(Property(**prop).type == "element" for prop in (action_step.properties or [])): + return True + if any(getattr(action_step, attribute) is not None for attribute in ["selector", "tag_name", "href", "text"]): + return True + return False diff --git a/ee/clickhouse/models/clickhouse.py b/ee/clickhouse/models/clickhouse.py deleted file mode 100644 index d995e4cc1e2d9..0000000000000 --- a/ee/clickhouse/models/clickhouse.py +++ /dev/null @@ -1,5 +0,0 @@ -from posthog.models.utils import UUIDT - - -def generate_clickhouse_uuid() -> str: - return str(UUIDT()) diff --git a/ee/clickhouse/models/cohort.py b/ee/clickhouse/models/cohort.py index d14566d33274a..60824df8d3234 100644 --- a/ee/clickhouse/models/cohort.py +++ b/ee/clickhouse/models/cohort.py @@ -1,45 +1,240 @@ -from typing import Any, Dict, Tuple +import uuid +from datetime import datetime, timedelta +from typing import Any, Dict, List, Optional, Tuple, Union + +import structlog +from dateutil import parser +from django.conf import settings +from django.utils import timezone +from rest_framework.exceptions import ValidationError from ee.clickhouse.client import sync_execute from ee.clickhouse.models.action import format_action_filter -from ee.clickhouse.sql.cohort import CALCULATE_COHORT_PEOPLE_SQL -from ee.clickhouse.sql.person import GET_LATEST_PERSON_ID_SQL, GET_PERSON_IDS_BY_FILTER +from ee.clickhouse.sql.cohort import ( + CALCULATE_COHORT_PEOPLE_SQL, + GET_COHORT_SIZE_SQL, + GET_DISTINCT_ID_BY_ENTITY_SQL, + GET_PERSON_ID_BY_ENTITY_COUNT_SQL, + GET_PERSON_ID_BY_PRECALCULATED_COHORT_ID, + INSERT_PEOPLE_MATCHING_COHORT_ID_SQL, + REMOVE_PEOPLE_NOT_MATCHING_COHORT_ID_SQL, +) +from ee.clickhouse.sql.person import ( + GET_LATEST_PERSON_ID_SQL, + GET_PERSON_IDS_BY_FILTER, + GET_TEAM_PERSON_DISTINCT_IDS, + INSERT_PERSON_STATIC_COHORT, + PERSON_STATIC_COHORT_TABLE, +) from posthog.models import Action, Cohort, Filter, Team +from posthog.models.property import Property + +# temporary marker to denote when cohortpeople table started being populated +TEMP_PRECALCULATED_MARKER = parser.parse("2021-06-07T15:00:00+00:00") +logger = structlog.get_logger(__name__) -def format_person_query(cohort: Cohort) -> Tuple[str, Dict[str, Any]]: + +def format_person_query( + cohort: Cohort, index: int, *, custom_match_field: str = "person_id" +) -> Tuple[str, Dict[str, Any]]: filters = [] params: Dict[str, Any] = {} - for group_idx, group in enumerate(cohort.groups): - if group.get("action_id"): - action = Action.objects.get(pk=group["action_id"], team_id=cohort.team.pk) - action_filter_query, action_params = format_action_filter(action) - extract_person = "SELECT distinct_id FROM events WHERE team_id = %(team_id)s AND {query}".format( - query=action_filter_query - ) - params = {**params, **action_params} - filters.append("distinct_id IN (" + extract_person + ")") + + if cohort.is_static: + return format_static_cohort_query(cohort.pk, index, prepend="", custom_match_field=custom_match_field) + + or_queries = [] + groups = cohort.groups + + if not groups: + # No person can match a cohort that has no match groups + return "0 = 19", {} + + for group_idx, group in enumerate(groups): + if 
group.get("action_id") or group.get("event_id"): + entity_query, entity_params = get_entity_cohort_subquery(cohort, group, group_idx) + params = {**params, **entity_params} + filters.append(entity_query) elif group.get("properties"): - from ee.clickhouse.models.property import prop_filter_json_extract - - filter = Filter(data=group) - query = "" - for idx, prop in enumerate(filter.properties): - filter_query, filter_params = prop_filter_json_extract( - prop=prop, idx=idx, prepend="{}_{}_{}_person".format(cohort.pk, group_idx, idx) - ) - params = {**params, **filter_params} - query += " {}".format(filter_query) - filters.append("person_id IN {}".format(GET_LATEST_PERSON_ID_SQL.format(query=query))) + prop_query, prop_params = get_properties_cohort_subquery(cohort, group, group_idx) + or_queries.append(prop_query) + params = {**params, **prop_params} + + if len(or_queries) > 0: + query = "AND ({})".format(" OR ".join(or_queries)) + filters.append("{} IN {}".format(custom_match_field, GET_LATEST_PERSON_ID_SQL.format(query=query))) joined_filter = " OR ".join(filters) return joined_filter, params -def format_filter_query(cohort: Cohort) -> Tuple[str, Dict[str, Any]]: - person_query, params = format_person_query(cohort) - person_id_query = CALCULATE_COHORT_PEOPLE_SQL.format(query=person_query) +def format_static_cohort_query( + cohort_id: int, index: int, prepend: str, custom_match_field: str +) -> Tuple[str, Dict[str, Any]]: + return ( + f"{custom_match_field} IN (SELECT person_id FROM {PERSON_STATIC_COHORT_TABLE} WHERE cohort_id = %({prepend}_cohort_id_{index})s AND team_id = %(team_id)s)", + {f"{prepend}_cohort_id_{index}": cohort_id}, + ) + + +def format_precalculated_cohort_query( + cohort_id: int, index: int, prepend: str = "", custom_match_field="person_id" +) -> Tuple[str, Dict[str, Any]]: + filter_query = GET_PERSON_ID_BY_PRECALCULATED_COHORT_ID.format(index=index, prepend=prepend) + return ( + f""" + {custom_match_field} IN ({filter_query}) + """, + {f"{prepend}_cohort_id_{index}": cohort_id}, + ) + + +def get_properties_cohort_subquery(cohort: Cohort, cohort_group: Dict, group_idx: int) -> Tuple[str, Dict[str, Any]]: + from ee.clickhouse.models.property import prop_filter_json_extract + + filter = Filter(data=cohort_group) + params: Dict[str, Any] = {} + + query_parts = [] + for idx, prop in enumerate(filter.properties): + if prop.type == "cohort": + try: + prop_cohort: Cohort = Cohort.objects.get(pk=prop.value, team_id=cohort.team_id) + except Cohort.DoesNotExist: + return "0 = 14", {} + if prop_cohort.pk == cohort.pk: + # If we've encountered a cyclic dependency (meaning this cohort depends on this cohort), + # we treat it as satisfied for all persons + query_parts.append("AND 11 = 11") + else: + person_id_query, cohort_filter_params = format_filter_query(prop_cohort, idx, "person_id") + params.update(cohort_filter_params) + query_parts.append(f"AND person.id IN ({person_id_query})") + else: + filter_query, filter_params = prop_filter_json_extract( + prop=prop, + idx=idx, + prepend="{}_{}_{}_person".format(cohort.pk, group_idx, idx), + allow_denormalized_props=False, + ) + params.update(filter_params) + query_parts.append(filter_query) + + return "\n".join(query_parts).replace("AND ", "", 1), params + + +def get_entity_cohort_subquery(cohort: Cohort, cohort_group: Dict, group_idx: int): + event_id = cohort_group.get("event_id") + action_id = cohort_group.get("action_id") + days = cohort_group.get("days") + start_time = cohort_group.get("start_date") + end_time = 
cohort_group.get("end_date") + count = cohort_group.get("count") + count_operator = cohort_group.get("count_operator") + + date_query, date_params = get_date_query(days, start_time, end_time) + entity_query, entity_params = _get_entity_query(event_id, action_id, cohort.team.pk, group_idx) + + if count: + count_operator = _get_count_operator(count_operator) + extract_person = GET_PERSON_ID_BY_ENTITY_COUNT_SQL.format( + entity_query=entity_query, + date_query=date_query, + count_operator=count_operator, + GET_TEAM_PERSON_DISTINCT_IDS=GET_TEAM_PERSON_DISTINCT_IDS, + ) + params: Dict[str, Union[str, int]] = {"count": int(count), **entity_params, **date_params} + return f"person_id IN ({extract_person})", params + else: + extract_person = GET_DISTINCT_ID_BY_ENTITY_SQL.format(entity_query=entity_query, date_query=date_query,) + return f"distinct_id IN ({extract_person})", {**entity_params, **date_params} + + +def _get_count_operator(count_operator: Optional[str]) -> str: + if count_operator == "gte": + return ">=" + elif count_operator == "lte": + return "<=" + elif count_operator == "eq" or count_operator is None: + return "=" + else: + raise ValidationError("count_operator must be gte, lte, eq, or None") + + +def _get_entity_query( + event_id: Optional[str], action_id: Optional[int], team_id: int, group_idx: int +) -> Tuple[str, Dict[str, str]]: + if event_id: + return "event = %(event)s", {"event": event_id} + elif action_id: + action = Action.objects.get(pk=action_id, team_id=team_id) + action_filter_query, action_params = format_action_filter(action, prepend="_{}_action".format(group_idx)) + return action_filter_query, action_params + else: + raise ValidationError("Cohort query requires action_id or event_id") + + +def get_date_query( + days: Optional[str], start_time: Optional[str], end_time: Optional[str] +) -> Tuple[str, Dict[str, str]]: + date_query: str = "" + date_params: Dict[str, str] = {} + if days: + date_query, date_params = parse_entity_timestamps_in_days(int(days)) + elif start_time or end_time: + date_query, date_params = parse_cohort_timestamps(start_time, end_time) + + return date_query, date_params + + +def parse_entity_timestamps_in_days(days: int) -> Tuple[str, Dict[str, str]]: + curr_time = timezone.now() + start_time = curr_time - timedelta(days=days) + + return ( + "AND timestamp >= %(date_from)s AND timestamp <= %(date_to)s", + {"date_from": start_time.strftime("%Y-%m-%d %H:%M:%S"), "date_to": curr_time.strftime("%Y-%m-%d %H:%M:%S")}, + ) + + +def parse_cohort_timestamps(start_time: Optional[str], end_time: Optional[str]) -> Tuple[str, Dict[str, str]]: + clause = "AND " + params: Dict[str, str] = {} + + if start_time: + clause += "timestamp >= %(date_from)s" + + params = {"date_from": datetime.strptime(start_time, "%Y-%m-%dT%H:%M:%S").strftime("%Y-%m-%d %H:%M:%S")} + if end_time: + clause += "timestamp <= %(date_to)s" + params = {**params, "date_to": datetime.strptime(end_time, "%Y-%m-%dT%H:%M:%S").strftime("%Y-%m-%d %H:%M:%S")} + + return clause, params + + +def is_precalculated_query(cohort: Cohort) -> bool: + if ( + cohort.last_calculation + and cohort.last_calculation > TEMP_PRECALCULATED_MARKER + and settings.USE_PRECALCULATED_CH_COHORT_PEOPLE + and not cohort.is_static # static cohorts are handled within the regular cohort filter query path + ): + return True + else: + return False + + +def format_filter_query(cohort: Cohort, index: int = 0, id_column: str = "distinct_id") -> Tuple[str, Dict[str, Any]]: + is_precalculated = is_precalculated_query(cohort) + 
person_query, params = ( + format_precalculated_cohort_query(cohort.pk, index) if is_precalculated else format_person_query(cohort, index) + ) + + person_id_query = CALCULATE_COHORT_PEOPLE_SQL.format( + query=person_query, id_column=id_column, GET_TEAM_PERSON_DISTINCT_IDS=GET_TEAM_PERSON_DISTINCT_IDS + ) return person_id_query, params @@ -47,8 +242,89 @@ def get_person_ids_by_cohort_id(team: Team, cohort_id: int): from ee.clickhouse.models.property import parse_prop_clauses filters = Filter(data={"properties": [{"key": "id", "value": cohort_id, "type": "cohort"}],}) - filter_query, filter_params = parse_prop_clauses(filters.properties, team.pk, table_name="pid") + filter_query, filter_params = parse_prop_clauses(filters.properties, team.pk, table_name="pdi") - results = sync_execute(GET_PERSON_IDS_BY_FILTER.format(distinct_query=filter_query, query=""), filter_params) + results = sync_execute(GET_PERSON_IDS_BY_FILTER.format(distinct_query=filter_query, query=""), filter_params,) return [str(row[0]) for row in results] + + +def insert_static_cohort(person_uuids: List[Optional[uuid.UUID]], cohort_id: int, team: Team): + persons = ( + { + "id": str(uuid.uuid4()), + "person_id": str(person_uuid), + "cohort_id": cohort_id, + "team_id": team.pk, + "_timestamp": datetime.now(), + } + for person_uuid in person_uuids + ) + sync_execute(INSERT_PERSON_STATIC_COHORT, persons) + + +def recalculate_cohortpeople(cohort: Cohort): + cohort_filter, cohort_params = format_person_query(cohort, 0, custom_match_field="id") + + before_count = sync_execute(GET_COHORT_SIZE_SQL, {"cohort_id": cohort.pk, "team_id": cohort.team_id}) + logger.info( + "Recalculating cohortpeople starting", + team_id=cohort.team_id, + cohort_id=cohort.pk, + size_before=before_count[0][0], + ) + + cohort_filter = GET_PERSON_IDS_BY_FILTER.format(distinct_query="AND " + cohort_filter, query="") + + insert_cohortpeople_sql = INSERT_PEOPLE_MATCHING_COHORT_ID_SQL.format(cohort_filter=cohort_filter) + sync_execute(insert_cohortpeople_sql, {**cohort_params, "cohort_id": cohort.pk, "team_id": cohort.team_id}) + + remove_cohortpeople_sql = REMOVE_PEOPLE_NOT_MATCHING_COHORT_ID_SQL.format(cohort_filter=cohort_filter) + sync_execute(remove_cohortpeople_sql, {**cohort_params, "cohort_id": cohort.pk, "team_id": cohort.team_id}) + + count = sync_execute(GET_COHORT_SIZE_SQL, {"cohort_id": cohort.pk, "team_id": cohort.team_id}) + logger.info( + "Recalculating cohortpeople done", + team_id=cohort.team_id, + cohort_id=cohort.pk, + size_before=before_count[0][0], + size=count[0][0], + ) + + +def simplified_cohort_filter_properties(cohort: Cohort, team: Team) -> List[Property]: + """ + 'Simplifies' cohort property filters, removing team-specific context from properties. + """ + from ee.clickhouse.models.cohort import is_precalculated_query + + if cohort.is_static: + return [Property(type="static-cohort", key="id", value=cohort.pk)] + + # Cohort has been precalculated + if is_precalculated_query(cohort): + return [Property(type="precalculated-cohort", key="id", value=cohort.pk)] + + # Cohort can have multiple match groups. + # Each group is either + # 1. "user has done X in time range Y at least N times" or + # 2. "user has properties XYZ", including belonging to another cohort + # + # Users who match _any_ of the groups are considered to match the cohort. 
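To make the branching in simplified_cohort_filter_properties() below concrete, here is an illustrative summary of what each cohort shape simplifies to (written as pseudo-reprs rather than constructed Property objects; not part of this changeset):

OUTCOMES = [
    ("static cohort", "[Property(type='static-cohort', key='id', value=cohort.pk)]"),
    ("precalculated cohort", "[Property(type='precalculated-cohort', key='id', value=cohort.pk)]"),
    ("single group with only properties", "that group's Property list, inlined"),
    ("any group with action_id/event_id", "[Property(type='cohort', key='id', value=cohort.pk)]"),
    ("two or more match groups (OR)", "[Property(type='cohort', key='id', value=cohort.pk)]"),
    ("no groups at all", "[]"),
]
for shape, outcome in OUTCOMES:
    print(f"{shape:36} -> {outcome}")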
+ group_filters: List[List[Property]] = [] + for group in cohort.groups: + if group.get("action_id") or group.get("event_id"): + # :TODO: Support hasdone as separate property type + return [Property(type="cohort", key="id", value=cohort.pk)] + elif group.get("properties"): + # :TRICKY: This will recursively simplify all the properties + filter = Filter(data=group, team=team) + group_filters.append(filter.properties) + + if len(group_filters) > 1: + # :TODO: Support or properties + return [Property(type="cohort", key="id", value=cohort.pk)] + elif len(group_filters) == 1: + return group_filters[0] + else: + return [] diff --git a/ee/clickhouse/models/entity.py b/ee/clickhouse/models/entity.py new file mode 100644 index 0000000000000..65c4d31040b0b --- /dev/null +++ b/ee/clickhouse/models/entity.py @@ -0,0 +1,29 @@ +from typing import Any, Dict, Tuple + +from ee.clickhouse.models.action import format_action_filter +from ee.clickhouse.models.util import PersonPropertiesMode +from posthog.constants import TREND_FILTER_TYPE_ACTIONS +from posthog.models.entity import Entity + + +def get_entity_filtering_params( + entity: Entity, + team_id: int, + table_name: str = "", + *, + person_properties_mode: PersonPropertiesMode = PersonPropertiesMode.USING_PERSON_PROPERTIES_COLUMN, +) -> Tuple[Dict, Dict]: + params: Dict[str, Any] = {} + content_sql_params: Dict[str, str] + if entity.type == TREND_FILTER_TYPE_ACTIONS: + action = entity.get_action() + action_query, action_params = format_action_filter( + action, table_name=table_name, person_properties_mode=person_properties_mode, + ) + params.update(action_params) + content_sql_params = {"entity_query": f"AND {action_query}"} + else: + params["event"] = entity.id + content_sql_params = {"entity_query": f"AND event = %(event)s"} + + return params, content_sql_params diff --git a/ee/clickhouse/models/event.py b/ee/clickhouse/models/event.py index a4d29732e2e6c..810e9c4285313 100644 --- a/ee/clickhouse/models/event.py +++ b/ee/clickhouse/models/event.py @@ -5,15 +5,21 @@ import celery import pytz from dateutil.parser import isoparse +from django.conf import settings from django.utils import timezone from rest_framework import serializers +from sentry_sdk import capture_exception +from statshog.defaults.django import statsd from ee.clickhouse.client import sync_execute from ee.clickhouse.models.element import chain_to_elements, elements_to_string -from ee.clickhouse.sql.events import GET_EVENTS_BY_TEAM_SQL, GET_EVENTS_SQL, INSERT_EVENT_SQL +from ee.clickhouse.sql.events import GET_EVENTS_BY_TEAM_SQL, INSERT_EVENT_SQL from ee.idl.gen import events_pb2 from ee.kafka_client.client import ClickhouseProducer from ee.kafka_client.topics import KAFKA_EVENTS +from ee.models.hook import Hook +from posthog.constants import AvailableFeature +from posthog.models.action_step import ActionStep from posthog.models.element import Element from posthog.models.person import Person from posthog.models.team import Team @@ -29,7 +35,6 @@ def create_event( elements: Optional[List[Element]] = None, site_url: Optional[str] = None, ) -> str: - if not timestamp: timestamp = timezone.now() assert timestamp is not None @@ -61,11 +66,6 @@ def create_event( return str(event_uuid) -def get_events(): - events = sync_execute(GET_EVENTS_SQL) - return ClickhouseEventSerializer(events, many=True, context={"elements": None, "people": None}).data - - def get_events_by_team(team_id: Union[str, int]): events = sync_execute(GET_EVENTS_BY_TEAM_SQL, {"team_id": str(team_id)}) return 
ClickhouseEventSerializer(events, many=True, context={"elements": None, "people": None}).data @@ -112,7 +112,9 @@ def get_properties(self, event): prop_vals = [res.strip('"') for res in event[9]] return dict(zip(event[8], prop_vals)) else: - props = json.loads(event[2]) + # parse_constants gets called for any NaN, Infinity etc values + # we just want those to be returned as None + props = json.loads(event[2], parse_constant=lambda x: None) unpadded = {key: value.strip('"') if isinstance(value, str) else value for key, value in props.items()} return unpadded @@ -125,8 +127,16 @@ def get_timestamp(self, event): def get_person(self, event): if not self.context.get("people") or event[5] not in self.context["people"]: - return event[5] - return self.context["people"][event[5]].properties.get("email", event[5]) + return None + + person = self.context["people"][event[5]] + return { + "is_identified": person.is_identified, + "distinct_ids": person.distinct_ids[:1], # only send the first one to avoid a payload bloat + "properties": { + key: person.properties[key] for key in ["email", "name", "username"] if key in person.properties + }, + } def get_elements(self, event): if not event[6]: @@ -137,13 +147,15 @@ def get_elements_chain(self, event): return event[6] -def determine_event_conditions(conditions: Dict[str, Union[str, List[str]]]) -> Tuple[str, Dict]: +def determine_event_conditions( + team: Team, conditions: Dict[str, Union[str, List[str]]], long_date_from: bool +) -> Tuple[str, Dict]: result = "" params: Dict[str, Union[str, List[str]]] = {} for idx, (k, v) in enumerate(conditions.items()): if not isinstance(v, str): continue - if k == "after": + if k == "after" and not long_date_from: timestamp = isoparse(v).strftime("%Y-%m-%d %H:%M:%S.%f") result += "AND timestamp > %(after)s" params.update({"after": timestamp}) @@ -153,9 +165,9 @@ def determine_event_conditions(conditions: Dict[str, Union[str, List[str]]]) -> params.update({"before": timestamp}) elif k == "person_id": result += """AND distinct_id IN (%(distinct_ids)s)""" - distinct_ids = Person.objects.filter(pk=v)[0].distinct_ids - distinct_ids = [distinct_id.__str__() for distinct_id in distinct_ids] - params.update({"distinct_ids": distinct_ids}) + person = Person.objects.filter(pk=v, team_id=team.pk).first() + distinct_ids = person.distinct_ids if person is not None else [] + params.update({"distinct_ids": list(map(str, distinct_ids))}) elif k == "distinct_id": result += "AND distinct_id = %(distinct_id)s" params.update({"distinct_id": v}) @@ -163,3 +175,126 @@ def determine_event_conditions(conditions: Dict[str, Union[str, List[str]]]) -> result += "AND event = %(event)s" params.update({"event": v}) return result, params + + +def get_event_count_for_team_and_period( + team_id: Union[str, int], begin: timezone.datetime, end: timezone.datetime +) -> int: + result = sync_execute( + """ + SELECT count(1) as count + FROM events + WHERE team_id = %(team_id)s + AND timestamp between %(begin)s AND %(end)s + """, + {"team_id": str(team_id), "begin": begin, "end": end}, + )[0][0] + return result + + +def get_agg_event_count_for_teams(team_ids: List[Union[str, int]]) -> int: + result = sync_execute( + """ + SELECT count(1) as count + FROM events + WHERE team_id IN (%(team_id_clause)s) + """, + {"team_id_clause": team_ids}, + )[0][0] + return result + + +def get_agg_event_count_for_teams_and_period( + team_ids: List[Union[str, int]], begin: timezone.datetime, end: timezone.datetime +) -> int: + result = sync_execute( + """ + SELECT count(1) as 
count + FROM events + WHERE team_id IN (%(team_id_clause)s) + AND timestamp between %(begin)s AND %(end)s + """, + {"team_id_clause": team_ids, "begin": begin, "end": end}, + )[0][0] + return result + + +def get_event_count_for_team(team_id: Union[str, int]) -> int: + result = sync_execute( + """ + SELECT count(1) as count + FROM events + WHERE team_id = %(team_id)s + """, + {"team_id": str(team_id)}, + )[0][0] + return result + + +def get_event_count() -> int: + result = sync_execute( + """ + SELECT count(1) as count + FROM events + """ + )[0][0] + return result + + +def get_event_count_for_last_month() -> int: + result = sync_execute( + """ + -- count of events last month + SELECT + COUNT(1) freq + FROM events + WHERE + toStartOfMonth(timestamp) = toStartOfMonth(date_sub(MONTH, 1, now())) + """ + )[0][0] + return result + + +def get_event_count_month_to_date() -> int: + result = sync_execute( + """ + -- count of events month to date + SELECT + COUNT(1) freq + FROM events + WHERE toStartOfMonth(timestamp) = toStartOfMonth(now()); + """ + )[0][0] + return result + + +def get_events_count_for_team_by_client_lib( + team_id: Union[str, int], begin: timezone.datetime, end: timezone.datetime +) -> dict: + results = sync_execute( + """ + SELECT JSONExtractString(properties, '$lib') as lib, COUNT(1) as freq + FROM events + WHERE team_id = %(team_id)s + AND timestamp between %(begin)s AND %(end)s + GROUP BY lib + """, + {"team_id": str(team_id), "begin": begin, "end": end}, + ) + return {result[0]: result[1] for result in results} + + +def get_events_count_for_team_by_event_type( + team_id: Union[str, int], begin: timezone.datetime, end: timezone.datetime +) -> dict: + results = sync_execute( + """ + SELECT event, COUNT(1) as freq + FROM events + WHERE team_id = %(team_id)s + AND timestamp between %(begin)s AND %(end)s + GROUP BY event + """, + {"team_id": str(team_id), "begin": begin, "end": end}, + ) + return {result[0]: result[1] for result in results} diff --git a/ee/clickhouse/models/group.py b/ee/clickhouse/models/group.py new file mode 100644 index 0000000000000..f4333cb70bba3 --- /dev/null +++ b/ee/clickhouse/models/group.py @@ -0,0 +1,41 @@ +import datetime +import json +from typing import Dict, Optional + +from django.utils.timezone import now + +from ee.clickhouse.models.property import get_property_string_expr +from ee.clickhouse.sql.groups import INSERT_GROUP_SQL +from ee.kafka_client.client import ClickhouseProducer +from ee.kafka_client.topics import KAFKA_GROUPS + + +def create_group( + team_id: int, + group_type_index: int, + group_key: str, + properties: Optional[Dict] = {}, + timestamp: Optional[datetime.datetime] = None, +): + if not timestamp: + timestamp = now() + + data = { + "group_type_index": group_type_index, + "group_key": group_key, + "team_id": team_id, + "group_properties": json.dumps(properties), + "created_at": timestamp.strftime("%Y-%m-%d %H:%M:%S.%f"), + "_timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S"), + } + p = ClickhouseProducer() + p.produce(topic=KAFKA_GROUPS, sql=INSERT_GROUP_SQL, data=data) + + +def get_aggregation_target_field( + aggregation_group_type_index: Optional[int], event_table_alias: str, distinct_id_table_alias: str +) -> str: + if aggregation_group_type_index is not None: + return f"{event_table_alias}.$group_{aggregation_group_type_index}" + else: + return f"{distinct_id_table_alias}.person_id" diff --git a/ee/clickhouse/models/person.py b/ee/clickhouse/models/person.py index 0e1768f504e4e..d8b82aedcbcdc 100644 --- 
a/ee/clickhouse/models/person.py +++ b/ee/clickhouse/models/person.py @@ -1,7 +1,7 @@ import datetime import json -from typing import Any, Dict, List, Optional -from uuid import UUID, uuid4 +from typing import Dict, List, Optional, Union +from uuid import UUID from django.db.models.query import QuerySet from django.db.models.signals import post_delete, post_save @@ -10,33 +10,19 @@ from rest_framework import serializers from ee.clickhouse.client import sync_execute -from ee.clickhouse.models.property import parse_prop_clauses from ee.clickhouse.sql.person import ( DELETE_PERSON_BY_ID, - DELETE_PERSON_DISTINCT_ID_BY_PERSON_ID, DELETE_PERSON_EVENTS_BY_ID, - GET_DISTINCT_IDS_SQL, - GET_DISTINCT_IDS_SQL_BY_ID, - GET_PERSON_BY_DISTINCT_ID, - GET_PERSON_IDS_BY_FILTER, - GET_PERSON_SQL, INSERT_PERSON_DISTINCT_ID, INSERT_PERSON_SQL, - PERSON_DISTINCT_ID_EXISTS_SQL, - UPDATE_PERSON_ATTACHED_DISTINCT_ID, - UPDATE_PERSON_IS_IDENTIFIED, - UPDATE_PERSON_PROPERTIES, ) from ee.kafka_client.client import ClickhouseProducer from ee.kafka_client.topics import KAFKA_PERSON, KAFKA_PERSON_UNIQUE_ID -from posthog import settings -from posthog.ee import is_ee_enabled -from posthog.models.filter import Filter from posthog.models.person import Person, PersonDistinctId -from posthog.models.team import Team from posthog.models.utils import UUIDT +from posthog.utils import is_clickhouse_enabled -if settings.EE_AVAILABLE and is_ee_enabled(): +if is_clickhouse_enabled(): @receiver(post_save, sender=Person) def person_created(sender, instance: Person, created, **kwargs): @@ -49,11 +35,15 @@ def person_created(sender, instance: Person, created, **kwargs): @receiver(post_save, sender=PersonDistinctId) def person_distinct_id_created(sender, instance: PersonDistinctId, created, **kwargs): - create_person_distinct_id(instance.pk, instance.team.pk, instance.distinct_id, str(instance.person.uuid)) + create_person_distinct_id(instance.team.pk, instance.distinct_id, str(instance.person.uuid)) @receiver(post_delete, sender=Person) def person_deleted(sender, instance: Person, **kwargs): - delete_person(instance.uuid) + delete_person(instance.uuid, instance.properties, instance.is_identified, team_id=instance.team_id) + + @receiver(post_delete, sender=PersonDistinctId) + def person_distinct_id_deleted(sender, instance: PersonDistinctId, **kwargs): + create_person_distinct_id(instance.team.pk, instance.distinct_id, str(instance.person.uuid), sign=-1) def create_person( @@ -77,86 +67,106 @@ def create_person( "properties": json.dumps(properties), "is_identified": int(is_identified), "created_at": timestamp.strftime("%Y-%m-%d %H:%M:%S.%f"), + "_timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S"), } p = ClickhouseProducer() p.produce(topic=KAFKA_PERSON, sql=INSERT_PERSON_SQL, data=data, sync=sync) return uuid -def update_person_properties(team_id: int, id: str, properties: Dict) -> None: - sync_execute(UPDATE_PERSON_PROPERTIES, {"team_id": team_id, "id": id, "properties": json.dumps(properties)}) - - -def update_person_is_identified(team_id: int, id: str, is_identified: bool) -> None: - sync_execute( - UPDATE_PERSON_IS_IDENTIFIED, {"team_id": team_id, "id": id, "is_identified": "1" if is_identified else "0"} - ) - - -def create_person_distinct_id(id: int, team_id: int, distinct_id: str, person_id: str) -> None: - data = {"id": id, "distinct_id": distinct_id, "person_id": person_id, "team_id": team_id} +def create_person_distinct_id(team_id: int, distinct_id: str, person_id: str, sign=1) -> None: + data = {"distinct_id": distinct_id, 
"person_id": person_id, "team_id": team_id, "_sign": sign} p = ClickhouseProducer() p.produce(topic=KAFKA_PERSON_UNIQUE_ID, sql=INSERT_PERSON_DISTINCT_ID, data=data) -def distinct_ids_exist(team_id: int, ids: List[str]) -> bool: - return bool(sync_execute(PERSON_DISTINCT_ID_EXISTS_SQL.format([str(id) for id in ids]), {"team_id": team_id})[0][0]) - - -def get_persons(team_id: int): - result = sync_execute(GET_PERSON_SQL, {"team_id": team_id}) - return ClickhousePersonSerializer(result, many=True).data - - -def get_person_distinct_ids(team_id: int): - result = sync_execute(GET_DISTINCT_IDS_SQL, {"team_id": team_id}) - return ClickhousePersonDistinctIdSerializer(result, many=True).data - - -def get_person_by_distinct_id(team: Team, distinct_id: str, filter: Optional[Filter] = None) -> Dict[str, Any]: - params = {"team_id": team.pk, "distinct_id": distinct_id.__str__()} - filter_query = "" - if filter: - filter_query, filter_params = parse_prop_clauses(filter.properties, team.pk, table_name="pid") - params = {**params, **filter_params} - result = sync_execute(GET_PERSON_BY_DISTINCT_ID.format(distinct_query=filter_query, query=""), params) - if len(result) > 0: - return ClickhousePersonSerializer(result[0], many=False).data - return {} - - def get_persons_by_distinct_ids(team_id: int, distinct_ids: List[str]) -> QuerySet: return Person.objects.filter( team_id=team_id, persondistinctid__team_id=team_id, persondistinctid__distinct_id__in=distinct_ids ) -def merge_people(team_id: int, target: Dict, old_id: UUID, old_props: Dict) -> None: - # merge the properties - properties = {**old_props, **target["properties"]} +def get_persons_by_uuids(team_id: int, uuids: List[str]) -> QuerySet: + return Person.objects.filter(team_id=team_id, uuid__in=uuids) - update_person_properties(team_id=team_id, id=target["id"], properties=properties) - other_person_distinct_ids = sync_execute( - GET_DISTINCT_IDS_SQL_BY_ID, {"person_id": old_id, "team_id": target["team_id"]} - ) - - parsed_other_person_distinct_ids = ClickhousePersonDistinctIdSerializer(other_person_distinct_ids, many=True).data +def delete_person( + person_id: UUID, properties: Dict, is_identified: bool, delete_events: bool = False, team_id: int = False +) -> None: + timestamp = now() - for person_distinct_id in parsed_other_person_distinct_ids: - sync_execute( - UPDATE_PERSON_ATTACHED_DISTINCT_ID, - {"person_id": target["id"], "distinct_id": person_distinct_id["distinct_id"]}, - ) - delete_person(old_id) + data = { + "id": person_id, + "team_id": team_id, + "properties": json.dumps(properties), + "is_identified": int(is_identified), + "created_at": timestamp.strftime("%Y-%m-%d %H:%M:%S"), + "_timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S"), + } + try: + if delete_events: + sync_execute(DELETE_PERSON_EVENTS_BY_ID, {"id": person_id, "team_id": team_id}) + except: + pass # cannot delete if the table is distributed + + sync_execute(DELETE_PERSON_BY_ID, data) + + +def count_duplicate_distinct_ids_for_team(team_id: Union[str, int]) -> Dict: + cutoff_date = (datetime.datetime.now() - datetime.timedelta(weeks=1)).strftime("%Y-%m-%d %H:%M:%S") + query_result = sync_execute( + """ + SELECT + count(if(startdate < toDate(%(cutoff_date)s), 1, NULL)) as prev_ids_with_duplicates, + minus(sum(if(startdate < toDate(%(cutoff_date)s), count, 0)), prev_ids_with_duplicates) as prev_total_extra_distinct_id_rows, + count(if(startdate >= toDate(%(cutoff_date)s), 1, NULL)) as new_ids_with_duplicates, + minus(sum(if(startdate >= toDate(%(cutoff_date)s), count, 0)), 
prev_ids_with_duplicates) as new_total_extra_distinct_id_rows + FROM ( + SELECT distinct_id, count(*) as count, toDate(min(timestamp)) as startdate + FROM ( + SELECT person_id, distinct_id, max(_timestamp) as timestamp + FROM person_distinct_id + WHERE team_id = %(team_id)s + GROUP BY person_id, distinct_id, team_id + HAVING max(is_deleted) = 0 + ) + GROUP BY distinct_id + HAVING count > 1 + ) as duplicates + """, + {"team_id": str(team_id), "cutoff_date": cutoff_date}, + ) -def delete_person(person_id: UUID, delete_events: bool = False, team_id: int = False) -> None: - if delete_events: - sync_execute(DELETE_PERSON_EVENTS_BY_ID, {"id": person_id, "team_id": team_id}) + result = { + "prev_total_ids_with_duplicates": query_result[0][0], + "prev_total_extra_distinct_id_rows": query_result[0][1], + "new_total_ids_with_duplicates": query_result[0][2], + "new_total_extra_distinct_id_rows": query_result[0][3], + } + return result + + +def count_total_persons_with_multiple_ids(team_id: Union[str, int], min_ids: int = 2): + query_result = sync_execute( + """ + SELECT count(*) as total_persons, max(_count) as max_distinct_ids_for_one_person FROM ( + SELECT person_id, count(distinct_id) as _count + FROM person_distinct_id + WHERE team_id = %(team_id)s + GROUP BY person_id, team_id + HAVING max(is_deleted) = 0 + ) + WHERE _count > %(min_ids)s + """, + {"team_id": str(team_id), "min_ids": str(min_ids)}, + ) - sync_execute(DELETE_PERSON_BY_ID, {"id": person_id,}) - sync_execute(DELETE_PERSON_DISTINCT_ID_BY_PERSON_ID, {"id": person_id,}) + result = { + f"total_persons_with_more_than_{min_ids}_ids": query_result[0][0], + "max_distinct_ids_for_one_person": query_result[0][1], + } + return result class ClickhousePersonSerializer(serializers.Serializer): @@ -191,22 +201,3 @@ def get_is_identified(self, person): # all queries might not retrieve distinct_ids def get_distinct_ids(self, person): return person[5] if len(person) > 5 else [] - - -class ClickhousePersonDistinctIdSerializer(serializers.Serializer): - id = serializers.SerializerMethodField() - distinct_id = serializers.SerializerMethodField() - person_id = serializers.SerializerMethodField() - team_id = serializers.SerializerMethodField() - - def get_id(self, pid): - return pid[0] - - def get_distinct_id(self, pid): - return pid[1] - - def get_person_id(self, pid): - return pid[2] - - def get_team_id(self, pid): - return pid[3] diff --git a/ee/clickhouse/models/property.py b/ee/clickhouse/models/property.py index 0ad8ce9dd8fd8..eccbbf058747c 100644 --- a/ee/clickhouse/models/property.py +++ b/ee/clickhouse/models/property.py @@ -1,140 +1,403 @@ -from typing import Any, Dict, List, Optional, Tuple +import re +from typing import ( + Any, + Callable, + Counter, + Dict, + List, + Optional, + Tuple, + cast, +) + +from django.utils import timezone +from rest_framework import exceptions from ee.clickhouse.client import sync_execute -from ee.clickhouse.models.cohort import format_filter_query -from ee.clickhouse.models.util import is_int, is_json +from ee.clickhouse.materialized_columns.columns import TableWithProperties, get_materialized_columns +from ee.clickhouse.models.cohort import ( + format_filter_query, + format_precalculated_cohort_query, + format_static_cohort_query, +) +from ee.clickhouse.models.util import PersonPropertiesMode, is_json from ee.clickhouse.sql.events import SELECT_PROP_VALUES_SQL, SELECT_PROP_VALUES_SQL_WITH_FILTER -from ee.clickhouse.sql.person import GET_DISTINCT_IDS_BY_PROPERTY_SQL +from ee.clickhouse.sql.person import 
GET_DISTINCT_IDS_BY_PERSON_ID_FILTER, GET_DISTINCT_IDS_BY_PROPERTY_SQL from posthog.models.cohort import Cohort -from posthog.models.property import Property +from posthog.models.event import Selector +from posthog.models.property import ( + NEGATED_OPERATORS, + OperatorType, + Property, + PropertyIdentifier, + PropertyName, + PropertyType, +) from posthog.models.team import Team +from posthog.utils import is_valid_regex, relative_date_parse def parse_prop_clauses( - filters: List[Property], team_id: int, prepend: str = "", table_name: str = "" + filters: List[Property], + team_id: Optional[int], + prepend: str = "global", + table_name: str = "", + allow_denormalized_props: bool = True, + has_person_id_joined: bool = True, + person_properties_mode: PersonPropertiesMode = PersonPropertiesMode.USING_SUBQUERY, ) -> Tuple[str, Dict]: - final = "" - params: Dict[str, Any] = {"team_id": team_id} + final = [] + params: Dict[str, Any] = {} + if team_id is not None: + params["team_id"] = team_id if table_name != "": table_name += "." for idx, prop in enumerate(filters): if prop.type == "cohort": - cohort = Cohort.objects.get(pk=prop.value) - person_id_query, cohort_filter_params = format_filter_query(cohort) - params = {**params, **cohort_filter_params} - final += "AND {table_name}distinct_id IN ({clause}) ".format(table_name=table_name, clause=person_id_query) - elif prop.type == "person": - filter_query, filter_params = prop_filter_json_extract(prop, idx, "{}person".format(prepend)) - final += " AND {table_name}distinct_id IN ({filter_query})".format( - filter_query=GET_DISTINCT_IDS_BY_PROPERTY_SQL.format(filters=filter_query), table_name=table_name + try: + cohort = Cohort.objects.get(pk=prop.value, team_id=team_id) + except Cohort.DoesNotExist: + final.append("AND 0 = 13") # If cohort doesn't exist, nothing can match + else: + person_id_query, cohort_filter_params = format_filter_query(cohort, idx) + params = {**params, **cohort_filter_params} + final.append( + "AND {table_name}distinct_id IN ({clause})".format(table_name=table_name, clause=person_id_query) + ) + elif prop.type == "person" and person_properties_mode != PersonPropertiesMode.EXCLUDE: + # :TODO: Clean this up by using ClickhousePersonQuery over GET_DISTINCT_IDS_BY_PROPERTY_SQL to have access + # to materialized columns + # :TODO: (performance) Avoid subqueries whenever possible, use joins instead + is_direct_query = person_properties_mode == PersonPropertiesMode.USING_PERSON_PROPERTIES_COLUMN + filter_query, filter_params = prop_filter_json_extract( + prop, + idx, + "{}person".format(prepend), + prop_var="person_props" if is_direct_query else "properties", + allow_denormalized_props=allow_denormalized_props and is_direct_query, ) - params.update(filter_params) - else: + if is_direct_query: + final.append(filter_query) + params.update(filter_params) + else: + final.append( + "AND {table_name}distinct_id IN ({filter_query})".format( + filter_query=GET_DISTINCT_IDS_BY_PROPERTY_SQL.format(filters=filter_query), + table_name=table_name, + ) + ) + params.update(filter_params) + elif prop.type == "element": + query, filter_params = filter_element( + {prop.key: prop.value}, operator=prop.operator, prepend="{}_".format(idx) + ) + if query: + final.append(f" AND {query}") + params.update(filter_params) + elif prop.type == "event": filter_query, filter_params = prop_filter_json_extract( - prop, idx, prepend, prop_var="{}properties".format(table_name) + prop, + idx, + prepend, + prop_var="{}properties".format(table_name), + 
allow_denormalized_props=allow_denormalized_props, ) - final += " {filter_query} AND {table_name}team_id = %(team_id)s".format( - table_name=table_name, filter_query=filter_query + + final.append(f"{filter_query} AND {table_name}team_id = %(team_id)s" if team_id else filter_query) + params.update(filter_params) + elif prop.type == "group": + # :TRICKY: This assumes group properties have already been joined, as in trends query + filter_query, filter_params = prop_filter_json_extract( + prop, idx, prepend, prop_var=f"group_properties_{prop.group_type_index}", allow_denormalized_props=False ) + + final.append(filter_query) + params.update(filter_params) + elif prop.type in ("static-cohort", "precalculated-cohort"): + cohort_id = cast(int, prop.value) + + method = format_static_cohort_query if prop.type == "static-cohort" else format_precalculated_cohort_query + filter_query, filter_params = method(cohort_id, idx, prepend=prepend, custom_match_field="person_id") # type: ignore + if has_person_id_joined: + final.append(f" AND {filter_query}") + else: + # :TODO: (performance) Avoid subqueries whenever possible, use joins instead + subquery = GET_DISTINCT_IDS_BY_PERSON_ID_FILTER.format(filters=filter_query) + final.append(f"AND {table_name}distinct_id IN ({subquery})") params.update(filter_params) - return final, params + + return " ".join(final), params def prop_filter_json_extract( - prop: Property, idx: int, prepend: str = "", prop_var: str = "properties" + prop: Property, + idx: int, + prepend: str = "", + prop_var: str = "properties", + allow_denormalized_props: bool = True, + transform_expression: Optional[Callable[[str], str]] = None, ) -> Tuple[str, Dict[str, Any]]: + # TODO: Once all queries are migrated over we can get rid of allow_denormalized_props + if transform_expression is not None: + prop_var = transform_expression(prop_var) + + property_expr, is_denormalized = get_property_string_expr( + property_table(prop), prop.key, f"%(k{prepend}_{idx})s", prop_var, allow_denormalized_props + ) + + if is_denormalized and transform_expression: + property_expr = transform_expression(property_expr) + operator = prop.operator + params: Dict[str, Any] = {} + if operator == "is_not": - params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): prop.value} + params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): box_value(prop.value)} return ( - "AND NOT (trim(BOTH '\"' FROM JSONExtractRaw({prop_var}, %(k{prepend}_{idx})s)) = %(v{prepend}_{idx})s)".format( - idx=idx, prepend=prepend, prop_var=prop_var - ), + "AND NOT has(%(v{prepend}_{idx})s, {left})".format(idx=idx, prepend=prepend, left=property_expr), params, ) elif operator == "icontains": value = "%{}%".format(prop.value) params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): value} return ( - "AND trim(BOTH '\"' FROM JSONExtractRaw({prop_var}, %(k{prepend}_{idx})s)) LIKE %(v{prepend}_{idx})s".format( - idx=idx, prepend=prepend, prop_var=prop_var - ), + "AND {left} ILIKE %(v{prepend}_{idx})s".format(idx=idx, prepend=prepend, left=property_expr), params, ) elif operator == "not_icontains": value = "%{}%".format(prop.value) params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): value} return ( - "AND NOT (trim(BOTH '\"' FROM JSONExtractRaw({prop_var}, %(k{prepend}_{idx})s)) LIKE %(v{prepend}_{idx})s)".format( - idx=idx, prepend=prepend, prop_var=prop_var - ), + "AND NOT ({left} ILIKE %(v{prepend}_{idx})s)".format(idx=idx, prepend=prepend, 
left=property_expr), params, ) - elif operator == "regex": - params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): prop.value} - return ( - "AND match(trim(BOTH '\"' FROM JSONExtractRaw({prop_var}, %(k{prepend}_{idx})s)), %(v{prepend}_{idx})s)".format( - idx=idx, prepend=prepend, prop_var=prop_var - ), - params, - ) - elif operator == "not_regex": + elif operator in ("regex", "not_regex"): + if not is_valid_regex(prop.value): + return "AND 1 = 2", {} + params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): prop.value} + return ( - "AND NOT match(trim(BOTH '\"' FROM JSONExtractRaw({prop_var}, %(k{prepend}_{idx})s)), %(v{prepend}_{idx})s)".format( - idx=idx, prepend=prepend, prop_var=prop_var + "AND {regex_function}({left}, %(v{prepend}_{idx})s)".format( + regex_function="match" if operator == "regex" else "NOT match", + idx=idx, + prepend=prepend, + left=property_expr, ), params, ) elif operator == "is_set": params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): prop.value} + if is_denormalized: + return ( + "AND notEmpty({left})".format(left=property_expr), + params, + ) return ( "AND JSONHas({prop_var}, %(k{prepend}_{idx})s)".format(idx=idx, prepend=prepend, prop_var=prop_var), params, ) elif operator == "is_not_set": params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): prop.value} + if is_denormalized: + return ( + "AND empty({left})".format(left=property_expr), + params, + ) return ( - "AND (isNull(trim(BOTH '\"' FROM JSONExtractRaw({prop_var}, %(k{prepend}_{idx})s))) OR NOT JSONHas({prop_var}, %(k{prepend}_{idx})s))".format( - idx=idx, prepend=prepend, prop_var=prop_var + "AND (isNull({left}) OR NOT JSONHas({prop_var}, %(k{prepend}_{idx})s))".format( + idx=idx, prepend=prepend, prop_var=prop_var, left=property_expr ), params, ) elif operator == "gt": params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): prop.value} return ( - "AND toInt64OrNull(replaceRegexpAll(visitParamExtractRaw({prop_var}, %(k{prepend}_{idx})s), ' ', '')) > %(v{prepend}_{idx})s".format( - idx=idx, prepend=prepend, prop_var=prop_var + "AND toFloat64OrNull(trim(BOTH '\"' FROM replaceRegexpAll({left}, ' ', ''))) > %(v{prepend}_{idx})s".format( + idx=idx, prepend=prepend, left=property_expr, ), params, ) elif operator == "lt": params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): prop.value} return ( - "AND toInt64OrNull(replaceRegexpAll(visitParamExtractRaw({prop_var}, %(k{prepend}_{idx})s), ' ', '')) < %(v{prepend}_{idx})s".format( - idx=idx, prepend=prepend, prop_var=prop_var + "AND toFloat64OrNull(trim(BOTH '\"' FROM replaceRegexpAll({left}, ' ', ''))) < %(v{prepend}_{idx})s".format( + idx=idx, prepend=prepend, left=property_expr, ), params, ) else: - if is_int(prop.value): - clause = "AND JSONExtractInt({prop_var}, %(k{prepend}_{idx})s) = %(v{prepend}_{idx})s" - elif is_json(prop.value): - clause = "AND replaceRegexpAll(visitParamExtractRaw({prop_var}, %(k{prepend}_{idx})s),' ', '') = replaceRegexpAll(toString(%(v{prepend}_{idx})s),' ', '')" + if is_json(prop.value) and not is_denormalized: + clause = "AND has(%(v{prepend}_{idx})s, replaceRegexpAll(visitParamExtractRaw({prop_var}, %(k{prepend}_{idx})s),' ', ''))" + params = { + "k{}_{}".format(prepend, idx): prop.key, + "v{}_{}".format(prepend, idx): box_value(prop.value, remove_spaces=True), + } else: - clause = "AND trim(BOTH '\"' FROM JSONExtractRaw({prop_var}, %(k{prepend}_{idx})s)) = 
%(v{prepend}_{idx})s" - - params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): prop.value} + clause = "AND has(%(v{prepend}_{idx})s, {left})" + params = {"k{}_{}".format(prepend, idx): prop.key, "v{}_{}".format(prepend, idx): box_value(prop.value)} return ( - clause.format(idx=idx, prepend=prepend, prop_var=prop_var), + clause.format(left=property_expr, idx=idx, prepend=prepend, prop_var=prop_var), params, ) +def property_table(property: Property) -> TableWithProperties: + if property.type == "event": + return "events" + elif property.type == "person": + return "person" + elif property.type == "group": + return "groups" + else: + raise ValueError(f"Property type does not have a table: {property.type}") + + +def get_property_string_expr( + table: TableWithProperties, + property_name: PropertyName, + var: str, + prop_var: str, + allow_denormalized_props: bool = True, +) -> Tuple[str, bool]: + materialized_columns = get_materialized_columns(table) if allow_denormalized_props else {} + + if allow_denormalized_props and property_name in materialized_columns: + return materialized_columns[property_name], True + + return f"trim(BOTH '\"' FROM JSONExtractRaw({prop_var}, {var}))", False + + +def box_value(value: Any, remove_spaces=False) -> List[Any]: + if not isinstance(value, List): + value = [value] + return [str(value).replace(" ", "") if remove_spaces else str(value) for value in value] + + def get_property_values_for_key(key: str, team: Team, value: Optional[str] = None): + + parsed_date_from = "AND timestamp >= '{}'".format(relative_date_parse("-7d").strftime("%Y-%m-%d 00:00:00")) + parsed_date_to = "AND timestamp <= '{}'".format(timezone.now().strftime("%Y-%m-%d 23:59:59")) + if value: return sync_execute( - SELECT_PROP_VALUES_SQL_WITH_FILTER, {"team_id": team.pk, "key": key, "value": "%{}%".format(value)}, + SELECT_PROP_VALUES_SQL_WITH_FILTER.format(parsed_date_from=parsed_date_from, parsed_date_to=parsed_date_to), + {"team_id": team.pk, "key": key, "value": "%{}%".format(value)}, ) - return sync_execute(SELECT_PROP_VALUES_SQL, {"team_id": team.pk, "key": key}) + return sync_execute( + SELECT_PROP_VALUES_SQL.format(parsed_date_from=parsed_date_from, parsed_date_to=parsed_date_to), + {"team_id": team.pk, "key": key}, + ) + + +def filter_element(filters: Dict, *, operator: Optional[OperatorType] = None, prepend: str = "") -> Tuple[str, Dict]: + if not operator: + operator = "exact" + + params = {} + final_conditions = [] + + if filters.get("selector") is not None: + if operator not in ("exact", "is_not"): + raise exceptions.ValidationError( + 'Filtering by element selector only supports operators "equals" and "doesn\'t equal" currently.' 
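A brief, hedged illustration of get_property_string_expr defined above; "$browser" is only an example property name, not one referenced in this diff:

    expr, is_denormalized = get_property_string_expr("events", "$browser", "%(k_0)s", "properties")
    # With no materialized column for "$browser" this returns the JSON fallback,
    #   trim(BOTH '"' FROM JSONExtractRaw(properties, %(k_0)s))
    # and is_denormalized is False. If "$browser" appeared in
    # get_materialized_columns("events"), expr would instead be that column's name and
    # is_denormalized would be True, which is how prop_filter_json_extract can skip
    # JSON parsing for denormalized properties.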
+ ) + selectors = filters["selector"] if isinstance(filters["selector"], list) else [filters["selector"]] + if selectors: + combination_conditions = [] + for idx, query in enumerate(selectors): + if not query: # Skip empty selectors + continue + selector = Selector(query, escape_slashes=False) + key = f"{prepend}_{idx}_selector_regex" + params[key] = build_selector_regex(selector) + combination_conditions.append(f"match(elements_chain, %({key})s)") + if combination_conditions: + final_conditions.append(f"({' OR '.join(combination_conditions)})") + elif operator not in NEGATED_OPERATORS: + # If a non-negated filter has an empty selector list provided, it can't match anything + return "0 = 191", {} + + if filters.get("tag_name") is not None: + if operator not in ("exact", "is_not"): + raise exceptions.ValidationError( + 'Filtering by element tag only supports operators "equals" and "doesn\'t equal" currently.' + ) + tag_names = filters["tag_name"] if isinstance(filters["tag_name"], list) else [filters["tag_name"]] + if tag_names: + combination_conditions = [] + for idx, tag_name in enumerate(tag_names): + key = f"{prepend}_{idx}_tag_name_regex" + params[key] = rf"(^|;){tag_name}(\.|$|;|:)" + combination_conditions.append(f"match(elements_chain, %({key})s)") + final_conditions.append(f"({' OR '.join(combination_conditions)})") + elif operator not in NEGATED_OPERATORS: + # If a non-negated filter has an empty tag_name list provided, it can't match anything + return "0 = 192", {} + + attributes: Dict[str, List] = {} + for key in ["href", "text"]: + if filters.get(key) is not None: + attributes[key] = process_ok_values(filters[key], operator) + if attributes: + for key, ok_values in attributes.items(): + if ok_values: + combination_conditions = [] + for idx, value in enumerate(ok_values): + optional_flag = "(?i)" if operator.endswith("icontains") else "" + params[f"{prepend}_{key}_{idx}_attributes_regex"] = f'{optional_flag}({key}="{value}")' + combination_conditions.append(f"match(elements_chain, %({prepend}_{key}_{idx}_attributes_regex)s)") + final_conditions.append(f"({' OR '.join(combination_conditions)})") + elif operator not in NEGATED_OPERATORS: + # If a non-negated filter has an empty href or text list provided, it can't match anything + return "0 = 193", {} + + if final_conditions: + return f"{'NOT ' if operator in NEGATED_OPERATORS else ''}({' AND '.join(final_conditions)})", params + else: + return "", {} + + +def process_ok_values(ok_values: Any, operator: OperatorType) -> List[str]: + if operator.endswith("_set"): + return [r'[^"]+'] + else: + # Make sure ok_values is a list + ok_values = cast(List[str], [str(val) for val in ok_values]) if isinstance(ok_values, list) else [ok_values] + # Escape double quote characters, since e.g. 
text 'foo="bar"' is represented as text="foo=\"bar\"" + # in the elements chain + ok_values = [text.replace('"', r"\"") for text in ok_values] + if operator.endswith("icontains"): + # Process values for case-insensitive-contains matching by way of regex, + # making sure matching scope is limited to between double quotes + return [rf'[^"]*{re.escape(text)}[^"]*' for text in ok_values] + if operator.endswith("regex"): + # Use values as-is in case of regex matching + return ok_values + # For all other operators escape regex-meaningful sequences + return [re.escape(text) for text in ok_values] + + +def build_selector_regex(selector: Selector) -> str: + regex = r"" + for tag in selector.parts: + if tag.data.get("tag_name") and isinstance(tag.data["tag_name"], str): + if tag.data["tag_name"] == "*": + regex += ".+" + else: + regex += tag.data["tag_name"] + if tag.data.get("attr_class__contains"): + regex += r".*?\.{}".format(r"\..*?".join(sorted(tag.data["attr_class__contains"]))) + if tag.ch_attributes: + regex += ".*?" + for key, value in sorted(tag.ch_attributes.items()): + regex += '{}="{}".*?'.format(key, value) + regex += r"([-_a-zA-Z0-9\.]*?)?($|;|:([^;^\s]*(;|$|\s)))" + if tag.direct_descendant: + regex += ".*" + return regex + + +def extract_tables_and_properties(props: List[Property]) -> Counter[PropertyIdentifier]: + return Counter((prop.key, prop.type, prop.group_type_index) for prop in props) diff --git a/ee/clickhouse/models/session_recording_event.py b/ee/clickhouse/models/session_recording_event.py index bb786d9987cc6..5277f387dd4c1 100644 --- a/ee/clickhouse/models/session_recording_event.py +++ b/ee/clickhouse/models/session_recording_event.py @@ -1,36 +1,51 @@ import datetime import json +import logging import uuid -from typing import Dict, List, Optional, Tuple, Union +from typing import Union -from dateutil.parser import isoparse -from django.utils import timezone +from sentry_sdk import capture_exception +from ee.clickhouse.client import sync_execute from ee.clickhouse.models.util import cast_timestamp_or_now from ee.clickhouse.sql.session_recording_events import INSERT_SESSION_RECORDING_EVENT_SQL from ee.kafka_client.client import ClickhouseProducer from ee.kafka_client.topics import KAFKA_SESSION_RECORDING_EVENTS +logger = logging.getLogger(__name__) + +MAX_KAFKA_MESSAGE_LENGTH = 800_000 +MAX_INSERT_LENGTH = 15_000_000 + def create_session_recording_event( uuid: uuid.UUID, team_id: int, distinct_id: str, session_id: str, + window_id: str, timestamp: Union[datetime.datetime, str], snapshot_data: dict, ) -> str: timestamp = cast_timestamp_or_now(timestamp) + snapshot_data_json = json.dumps(snapshot_data) data = { "uuid": str(uuid), "team_id": team_id, "distinct_id": distinct_id, "session_id": session_id, - "snapshot_data": json.dumps(snapshot_data), + "window_id": window_id, + "snapshot_data": snapshot_data_json, "timestamp": timestamp, "created_at": timestamp, } - p = ClickhouseProducer() - p.produce(sql=INSERT_SESSION_RECORDING_EVENT_SQL, topic=KAFKA_SESSION_RECORDING_EVENTS, data=data) + if len(snapshot_data_json) <= MAX_KAFKA_MESSAGE_LENGTH: + p = ClickhouseProducer() + p.produce(sql=INSERT_SESSION_RECORDING_EVENT_SQL, topic=KAFKA_SESSION_RECORDING_EVENTS, data=data) + elif len(snapshot_data_json) <= MAX_INSERT_LENGTH: + sync_execute(INSERT_SESSION_RECORDING_EVENT_SQL, data, settings={"max_query_size": MAX_INSERT_LENGTH}) + else: + capture_exception(Exception(f"Session recording event data too large - {len(snapshot_data_json)}")) + return str(uuid) diff --git 
a/ee/clickhouse/models/test/__snapshots__/test_property.ambr b/ee/clickhouse/models/test/__snapshots__/test_property.ambr new file mode 100644 index 0000000000000..d7899bf60bec4 --- /dev/null +++ b/ee/clickhouse/models/test/__snapshots__/test_property.ambr @@ -0,0 +1,66 @@ +# name: test_parse_prop_clauses_defaults + ( + ' + AND has(%(vglobal_0)s, trim(BOTH '"' FROM JSONExtractRaw(properties, %(kglobal_0)s))) AND distinct_id IN ( + SELECT distinct_id + FROM ( + + SELECT distinct_id, argMax(person_id, _timestamp) as person_id + FROM ( + SELECT distinct_id, person_id, max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = %(team_id)s + GROUP BY person_id, distinct_id, team_id + HAVING max(is_deleted) = 0 + ) + GROUP BY distinct_id + + ) + WHERE person_id IN + ( + SELECT id + FROM ( + SELECT id, argMax(properties, person._timestamp) as properties, max(is_deleted) as is_deleted + FROM person + WHERE team_id = %(team_id)s + GROUP BY id + HAVING is_deleted = 0 + ) + WHERE 1 = 1 AND trim(BOTH '"' FROM JSONExtractRaw(properties, %(kglobalperson_1)s)) ILIKE %(vglobalperson_1)s + ) + ) + ', + { + 'kglobal_0': 'event_prop', + 'kglobalperson_1': 'email', + 'vglobal_0': [ + 'value', + ], + 'vglobalperson_1': '%posthog%', + }, + ) +--- +# name: test_parse_prop_clauses_defaults.1 + ( + 'AND has(%(vglobal_0)s, trim(BOTH \'"\' FROM JSONExtractRaw(properties, %(kglobal_0)s))) AND trim(BOTH \'"\' FROM JSONExtractRaw(person_props, %(kglobalperson_1)s)) ILIKE %(vglobalperson_1)s', + { + 'kglobal_0': 'event_prop', + 'kglobalperson_1': 'email', + 'vglobal_0': [ + 'value', + ], + 'vglobalperson_1': '%posthog%', + }, + ) +--- +# name: test_parse_prop_clauses_defaults.2 + ( + 'AND has(%(vglobal_0)s, trim(BOTH \'"\' FROM JSONExtractRaw(properties, %(kglobal_0)s)))', + { + 'kglobal_0': 'event_prop', + 'vglobal_0': [ + 'value', + ], + }, + ) +--- diff --git a/ee/clickhouse/models/test/test_action.py b/ee/clickhouse/models/test/test_action.py index bc9b00b754de2..a9c7a43ea1a57 100644 --- a/ee/clickhouse/models/test/test_action.py +++ b/ee/clickhouse/models/test/test_action.py @@ -7,11 +7,11 @@ from ee.clickhouse.models.event import create_event from ee.clickhouse.sql.actions import ACTION_QUERY from ee.clickhouse.util import ClickhouseTestMixin -from posthog.api.test.base import BaseTest from posthog.models.action import Action from posthog.models.action_step import ActionStep from posthog.models.event import Event from posthog.models.person import Person +from posthog.test.base import BaseTest from posthog.test.test_event_model import filter_by_actions_factory @@ -23,7 +23,7 @@ def _create_event(**kwargs) -> Event: def query_action(action: Action) -> Optional[List]: - formatted_query, params = format_action_filter(action, "", 0) + formatted_query, params = format_action_filter(action, "") query = ACTION_QUERY.format(action_filter=formatted_query) @@ -50,6 +50,9 @@ def _create_person(**kwargs) -> Person: return Person(id=person.uuid) +EVENT_UUID_QUERY = "SELECT uuid FROM events WHERE {} AND team_id = %(team_id)s" + + class TestActions( ClickhouseTestMixin, filter_by_actions_factory(_create_event, _create_person, _get_events_for_action) # type: ignore ): @@ -86,13 +89,13 @@ def test_filter_event_exact_url(self): ) query, params = filter_event(step1) - full_query = "SELECT uuid FROM events WHERE {}".format(" AND ".join(query)) + full_query = EVENT_UUID_QUERY.format(" AND ".join(query)) result = sync_execute(full_query, {**params, "team_id": self.team.pk}) self.assertEqual(str(result[0][0]), 
event_target.pk) def test_filter_event_contains_url(self): - event_target = _create_event( + _create_event( event="$autocapture", team=self.team, distinct_id="whatever", @@ -117,13 +120,13 @@ def test_filter_event_contains_url(self): step1 = ActionStep.objects.create(event="$autocapture", action=action1, url="https://posthog.com/feedback/123",) query, params = filter_event(step1) - full_query = "SELECT uuid FROM events WHERE {}".format(" AND ".join(query)) + full_query = EVENT_UUID_QUERY.format(" AND ".join(query)) result = sync_execute(full_query, {**params, "team_id": self.team.pk}) self.assertEqual(len(result), 2) def test_filter_event_regex_url(self): - event_target = _create_event( + _create_event( event="$autocapture", team=self.team, distinct_id="whatever", @@ -150,6 +153,27 @@ def test_filter_event_regex_url(self): ) query, params = filter_event(step1) - full_query = "SELECT uuid FROM events WHERE {}".format(" AND ".join(query)) + full_query = EVENT_UUID_QUERY.format(" AND ".join(query)) result = sync_execute(full_query, {**params, "team_id": self.team.pk}) self.assertEqual(len(result), 2) + + def test_double(self): + # Tests a regression where the second step properties would override those of the first step, causing issues + _create_event( + event="insight viewed", team=self.team, distinct_id="whatever", properties={"filters_count": 2}, + ) + + action1 = Action.objects.create(team=self.team, name="action1") + step1 = ActionStep.objects.create( + event="insight viewed", + action=action1, + properties=[{"key": "insight", "type": "event", "value": ["RETENTION"], "operator": "exact"}], + ) + step2 = ActionStep.objects.create( + event="insight viewed", + action=action1, + properties=[{"key": "filters_count", "type": "event", "value": "1", "operator": "gt"}], + ) + + events = query_action(action1) + self.assertEqual(len(events), 1) # type: ignore diff --git a/ee/clickhouse/models/test/test_cohort.py b/ee/clickhouse/models/test/test_cohort.py index 9e91603571797..55b929e15d024 100644 --- a/ee/clickhouse/models/test/test_cohort.py +++ b/ee/clickhouse/models/test/test_cohort.py @@ -1,21 +1,28 @@ from datetime import datetime +from typing import List, Tuple, cast from uuid import uuid4 +import pytest +import sqlparse +from django.utils import timezone +from freezegun import freeze_time + from ee.clickhouse.client import sync_execute -from ee.clickhouse.models.cohort import format_filter_query, format_person_query, get_person_ids_by_cohort_id +from ee.clickhouse.models.cohort import format_filter_query, get_person_ids_by_cohort_id, recalculate_cohortpeople from ee.clickhouse.models.event import create_event from ee.clickhouse.models.person import create_person, create_person_distinct_id from ee.clickhouse.models.property import parse_prop_clauses from ee.clickhouse.util import ClickhouseTestMixin -from posthog.api.test.base import BaseTest from posthog.models.action import Action from posthog.models.action_step import ActionStep from posthog.models.cohort import Cohort from posthog.models.event import Event -from posthog.models.filter import Filter +from posthog.models.filters import Filter +from posthog.models.organization import Organization from posthog.models.person import Person from posthog.models.team import Team from posthog.models.utils import UUIDT +from posthog.test.base import BaseTest def _create_event(**kwargs) -> Event: @@ -42,7 +49,7 @@ def _create_person(**kwargs) -> Person: distinct_ids = kwargs.pop("distinct_ids") person = create_person(uuid=uuid, **kwargs) for id in 
distinct_ids: - create_person_distinct_id(0, kwargs["team_id"], id, str(person)) + create_person_distinct_id(kwargs["team_id"], id, str(person)) return Person(id=person, uuid=person) @@ -99,12 +106,115 @@ def test_prop_cohort_basic_action(self): cohort1 = Cohort.objects.create(team=self.team, groups=[{"action_id": action.pk}], name="cohort1",) - filter = Filter(data={"properties": [{"key": "id", "value": cohort1.pk, "type": "cohort"}],}) + filter = Filter(data={"properties": [{"key": "id", "value": cohort1.pk, "type": "cohort"}],}, team=self.team) query, params = parse_prop_clauses(filter.properties, self.team.pk) final_query = "SELECT uuid FROM events WHERE team_id = %(team_id)s {}".format(query) result = sync_execute(final_query, {**params, "team_id": self.team.pk}) self.assertEqual(len(result), 1) + def test_prop_cohort_basic_event_days(self): + + _create_person(distinct_ids=["some_other_id"], team_id=self.team.pk, properties={"$some_prop": "something"}) + + _create_person( + distinct_ids=["some_id"], + team_id=self.team.pk, + properties={"$some_prop": "something", "$another_prop": "something"}, + ) + + _create_event( + event="$pageview", + team=self.team, + distinct_id="some_id", + properties={"attr": "some_val"}, + timestamp=datetime(2020, 1, 9, 12, 0, 1), + ) + + _create_event( + event="$pageview", + team=self.team, + distinct_id="some_other_id", + properties={"attr": "some_val"}, + timestamp=datetime(2020, 1, 5, 12, 0, 1), + ) + + with freeze_time("2020-01-10"): + cohort1 = Cohort.objects.create( + team=self.team, groups=[{"event_id": "$pageview", "days": 1}], name="cohort1", + ) + + filter = Filter( + data={"properties": [{"key": "id", "value": cohort1.pk, "type": "cohort"}],}, team=self.team + ) + query, params = parse_prop_clauses(filter.properties, self.team.pk) + final_query = "SELECT uuid FROM events WHERE team_id = %(team_id)s {}".format(query) + result = sync_execute(final_query, {**params, "team_id": self.team.pk}) + self.assertEqual(len(result), 1) + + cohort2 = Cohort.objects.create( + team=self.team, groups=[{"event_id": "$pageview", "days": 7}], name="cohort2", + ) + + filter = Filter( + data={"properties": [{"key": "id", "value": cohort2.pk, "type": "cohort"}],}, team=self.team + ) + query, params = parse_prop_clauses(filter.properties, self.team.pk) + final_query = "SELECT uuid FROM events WHERE team_id = %(team_id)s {}".format(query) + result = sync_execute(final_query, {**params, "team_id": self.team.pk}) + self.assertEqual(len(result), 2) + + def test_prop_cohort_basic_action_days(self): + + _create_person(distinct_ids=["some_other_id"], team_id=self.team.pk, properties={"$some_prop": "something"}) + + _create_person( + distinct_ids=["some_id"], + team_id=self.team.pk, + properties={"$some_prop": "something", "$another_prop": "something"}, + ) + + action = _create_action(team=self.team, name="$pageview") + _create_event( + event="$pageview", + team=self.team, + distinct_id="some_id", + properties={"attr": "some_val"}, + timestamp=datetime(2020, 1, 9, 12, 0, 1), + ) + + _create_event( + event="$pageview", + team=self.team, + distinct_id="some_other_id", + properties={"attr": "some_val"}, + timestamp=datetime(2020, 1, 5, 12, 0, 1), + ) + + with freeze_time("2020-01-10"): + cohort1 = Cohort.objects.create( + team=self.team, groups=[{"action_id": action.pk, "days": 1}], name="cohort1", + ) + + filter = Filter( + data={"properties": [{"key": "id", "value": cohort1.pk, "type": "cohort"}],}, team=self.team + ) + query, params = parse_prop_clauses(filter.properties, 
self.team.pk) + final_query = "SELECT uuid FROM events WHERE team_id = %(team_id)s {}".format(query) + result = sync_execute(final_query, {**params, "team_id": self.team.pk}) + self.assertEqual(len(result), 1) + + cohort2 = Cohort.objects.create( + team=self.team, groups=[{"action_id": action.pk, "days": 7}], name="cohort2", + ) + + filter = Filter( + data={"properties": [{"key": "id", "value": cohort2.pk, "type": "cohort"}],}, team=self.team + ) + query, params = parse_prop_clauses(filter.properties, self.team.pk) + final_query = "SELECT uuid FROM events WHERE team_id = %(team_id)s {}".format(query) + result = sync_execute(final_query, {**params, "team_id": self.team.pk}) + self.assertEqual(len(result), 2) + def test_prop_cohort_multiple_groups(self): _create_person(distinct_ids=["some_other_id"], team_id=self.team.pk, properties={"$some_prop": "something"}) @@ -124,14 +234,14 @@ def test_prop_cohort_multiple_groups(self): name="cohort1", ) - filter = Filter(data={"properties": [{"key": "id", "value": cohort1.pk, "type": "cohort"}],}) + filter = Filter(data={"properties": [{"key": "id", "value": cohort1.pk, "type": "cohort"}],}, team=self.team) query, params = parse_prop_clauses(filter.properties, self.team.pk) final_query = "SELECT uuid FROM events WHERE team_id = %(team_id)s {}".format(query) result = sync_execute(final_query, {**params, "team_id": self.team.pk}) self.assertEqual(len(result), 2) def test_prop_cohort_with_negation(self): - team2 = Team.objects.create() + team2 = Organization.objects.bootstrap(None)[2] _create_person(distinct_ids=["some_other_id"], team_id=self.team.pk, properties={"$some_prop": "something"}) @@ -145,50 +255,486 @@ def test_prop_cohort_with_negation(self): ) cohort1 = Cohort.objects.create( - team=self.team, groups=[{"properties": {"$some_prop__is_not": "something"}}], name="cohort1", + team=self.team, + groups=[ + {"properties": [{"type": "person", "key": "$some_prop", "operator": "is_not", "value": "something"}]} + ], + name="cohort1", ) - filter = Filter(data={"properties": [{"key": "id", "value": cohort1.pk, "type": "cohort"}],}) + filter = Filter(data={"properties": [{"key": "id", "value": cohort1.pk, "type": "cohort"}],}, team=self.team) query, params = parse_prop_clauses(filter.properties, self.team.pk) final_query = "SELECT uuid FROM events WHERE team_id = %(team_id)s {}".format(query) result = sync_execute(final_query, {**params, "team_id": self.team.pk}) self.assertEqual(len(result), 0) - def test_cohort_updated_props(self): - # The way clickhouse works is that updates aren't instant, so two people with the same ID are in the database - # Make sure we get the last one. 
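The cohort tests in this file keep repeating one query-building pattern; a minimal sketch of it, with a hypothetical helper name, for readability:

    def _events_matching_cohort_filter(team, cohort):
        # Build a cohort property filter, expand it into a WHERE fragment, and run
        # the result against the events table, exactly as the assertions here do.
        filter = Filter(data={"properties": [{"key": "id", "value": cohort.pk, "type": "cohort"}]}, team=team)
        query, params = parse_prop_clauses(filter.properties, team.pk)
        final_query = "SELECT uuid FROM events WHERE team_id = %(team_id)s {}".format(query)
        return sync_execute(final_query, {**params, "team_id": team.pk})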
- person1 = _create_person( - distinct_ids=["some_other_id_2"], + def test_cohort_get_person_ids_by_cohort_id(self): + user1 = _create_person(distinct_ids=["user1"], team_id=self.team.pk, properties={"$some_prop": "something"}) + user2 = _create_person(distinct_ids=["user2"], team_id=self.team.pk, properties={"$some_prop": "another"}) + user3 = _create_person(distinct_ids=["user3"], team_id=self.team.pk, properties={"$some_prop": "something"}) + cohort = Cohort.objects.create( + team=self.team, groups=[{"properties": {"$some_prop": "something"}}], name="cohort1", + ) + + results = get_person_ids_by_cohort_id(self.team, cohort.id) + self.assertEqual(len(results), 2) + self.assertIn(user1.uuid, results) + self.assertIn(user3.uuid, results) + + def test_insert_by_distinct_id_or_email(self): + Person.objects.create(team_id=self.team.pk, distinct_ids=["1"]) + Person.objects.create(team_id=self.team.pk, distinct_ids=["123"]) + Person.objects.create(team_id=self.team.pk, distinct_ids=["2"]) + # Team leakage + team2 = Team.objects.create(organization=self.organization) + Person.objects.create(team=team2, distinct_ids=["1"]) + + cohort = Cohort.objects.create(team=self.team, groups=[], is_static=True) + cohort.insert_users_by_list(["1", "123"]) + cohort = Cohort.objects.get() + results = get_person_ids_by_cohort_id(self.team, cohort.id) + self.assertEqual(len(results), 2) + self.assertEqual(cohort.is_calculating, False) + + # test SQLi + Person.objects.create(team_id=self.team.pk, distinct_ids=["'); truncate person_static_cohort; --"]) + cohort.insert_users_by_list(["'); truncate person_static_cohort; --", "123"]) + results = sync_execute( + "select count(1) from person_static_cohort where team_id = %(team_id)s", {"team_id": self.team.pk} + )[0][0] + self.assertEqual(results, 3) + + #  If we accidentally call calculate_people it shouldn't erase people + cohort.calculate_people() + results = get_person_ids_by_cohort_id(self.team, cohort.id) + self.assertEqual(len(results), 3) + + # if we add people again, don't increase the number of people in cohort + cohort.insert_users_by_list(["123"]) + results = get_person_ids_by_cohort_id(self.team, cohort.id) + self.assertEqual(len(results), 3) + + def test_cohortpeople_basic(self): + p1 = Person.objects.create( team_id=self.team.pk, - properties={"$some_prop": "updated"}, - timestamp=datetime(2020, 1, 1, 12, 0, 1), + distinct_ids=["1"], + properties={"$some_prop": "something", "$another_prop": "something"}, ) - _create_person( - uuid=person1.uuid, - distinct_ids=["some_other_id"], + p2 = Person.objects.create( team_id=self.team.pk, - properties={"$some_prop": "something"}, - timestamp=datetime(2020, 1, 1, 12, 0, 4), + distinct_ids=["2"], + properties={"$some_prop": "something", "$another_prop": "something"}, ) cohort1 = Cohort.objects.create( - team=self.team, groups=[{"properties": {"$some_prop": "updated"}}], name="cohort1", + team=self.team, + groups=[{"properties": {"$some_prop": "something", "$another_prop": "something"}}], + name="cohort1", ) - final_query, params = format_filter_query(cohort1) + cohort1.calculate_people_ch() - result = sync_execute(final_query, {**params, "team_id": self.team.pk}) - self.assertEqual(len(result), 0) + results = sync_execute( + "SELECT person_id FROM cohortpeople WHERE team_id = %(team_id)s", {"team_id": self.team.pk} + ) + self.assertEqual(len(results), 2) - def test_cohort_get_person_ids_by_cohort_id(self): - user1 = _create_person(distinct_ids=["user1"], team_id=self.team.pk, properties={"$some_prop": "something"}) - 
user2 = _create_person(distinct_ids=["user2"], team_id=self.team.pk, properties={"$some_prop": "another"}) - user3 = _create_person(distinct_ids=["user3"], team_id=self.team.pk, properties={"$some_prop": "something"}) - cohort = Cohort.objects.create( - team=self.team, groups=[{"properties": {"$some_prop": "something"}}], name="cohort1", + def test_cohortpeople_action_basic(self): + action = _create_action(team=self.team, name="$pageview") + p1 = Person.objects.create( + team_id=self.team.pk, + distinct_ids=["1"], + properties={"$some_prop": "something", "$another_prop": "something"}, ) - results = get_person_ids_by_cohort_id(self.team, cohort.id) + _create_event( + event="$pageview", + team=self.team, + distinct_id="1", + properties={"attr": "some_val"}, + timestamp=datetime(2020, 1, 9, 12, 0, 1), + ) + + p2 = Person.objects.create( + team_id=self.team.pk, + distinct_ids=["2"], + properties={"$some_prop": "something", "$another_prop": "something"}, + ) + + _create_event( + event="$pageview", + team=self.team, + distinct_id="2", + properties={"attr": "some_val"}, + timestamp=datetime(2020, 1, 9, 12, 0, 1), + ) + + cohort1 = Cohort.objects.create(team=self.team, groups=[{"action_id": action.pk, "days": 1}], name="cohort1",) + with freeze_time("2020-01-10"): + cohort1.calculate_people_ch() + + results = sync_execute( + "SELECT person_id FROM cohortpeople WHERE cohort_id = %(cohort_id)s", {"cohort_id": cohort1.pk} + ) self.assertEqual(len(results), 2) - self.assertIn(user1.uuid, results) - self.assertIn(user3.uuid, results) + + cohort2 = Cohort.objects.create(team=self.team, groups=[{"action_id": action.pk, "days": 1}], name="cohort2",) + with freeze_time("2020-01-10"): + cohort2.calculate_people_ch() + + results = sync_execute( + "SELECT person_id FROM cohortpeople WHERE cohort_id = %(cohort_id)s", {"cohort_id": cohort2.pk} + ) + self.assertEqual(len(results), 2) + + def test_cohortpeople_timestamp(self): + action = _create_action(team=self.team, name="$pageview") + p1 = Person.objects.create( + team_id=self.team.pk, + distinct_ids=["1"], + properties={"$some_prop": "something", "$another_prop": "something"}, + ) + + _create_event( + event="$pageview", + team=self.team, + distinct_id="1", + properties={"attr": "some_val"}, + timestamp=datetime(2020, 1, 9, 12, 0, 1), + ) + + p2 = Person.objects.create( + team_id=self.team.pk, + distinct_ids=["2"], + properties={"$some_prop": "something", "$another_prop": "something"}, + ) + + _create_event( + event="$pageview", + team=self.team, + distinct_id="2", + properties={"attr": "some_val"}, + timestamp=datetime(2020, 1, 7, 12, 0, 1), + ) + + cohort1 = Cohort.objects.create( + team=self.team, + groups=[{"action_id": action.pk, "start_date": datetime(2020, 1, 8, 12, 0, 1)}], + name="cohort1", + ) + with freeze_time("2020-01-10"): + cohort1.calculate_people_ch() + + results = sync_execute( + "SELECT person_id FROM cohortpeople where team_id = %(team_id)s", {"team_id": self.team.pk} + ) + self.assertEqual(len(results), 1) + + def _setup_actions_with_different_counts(self): + action = _create_action(team=self.team, name="$pageview") + p1 = Person.objects.create( + team_id=self.team.pk, + distinct_ids=["1"], + properties={"$some_prop": "something", "$another_prop": "something"}, + ) + + _create_event( + event="$pageview", + team=self.team, + distinct_id="1", + properties={"attr": "some_val"}, + timestamp=datetime(2020, 1, 8, 12, 0, 1), + ) + _create_event( + event="$pageview", + team=self.team, + distinct_id="1", + properties={"attr": "some_val"}, + 
timestamp=datetime(2020, 1, 9, 12, 0, 1), + ) + + p2 = Person.objects.create( + team_id=self.team.pk, + distinct_ids=["2"], + properties={"$some_prop": "something", "$another_prop": "something"}, + ) + + _create_event( + event="$pageview", + team=self.team, + distinct_id="2", + properties={"attr": "some_val"}, + timestamp=datetime(2020, 1, 8, 12, 0, 1), + ) + + _create_event( + event="$pageview", + team=self.team, + distinct_id="2", + properties={"attr": "some_val"}, + timestamp=datetime(2020, 1, 9, 12, 0, 1), + ) + + p3 = Person.objects.create( + team_id=self.team.pk, + distinct_ids=["3"], + properties={"$some_prop": "something", "$another_prop": "something"}, + ) + + _create_event( + event="$pageview", + team=self.team, + distinct_id="3", + properties={"attr": "some_val"}, + timestamp=datetime(2020, 1, 9, 12, 0, 1), + ) + return action + + def test_cohortpeople_action_count(self): + + action = self._setup_actions_with_different_counts() + + # test operators + cohort1 = Cohort.objects.create( + team=self.team, + groups=[{"action_id": action.pk, "days": 3, "count": 2, "count_operator": "gte"}], + name="cohort1", + ) + with freeze_time("2020-01-10"): + cohort1.calculate_people_ch() + + results = sync_execute( + "SELECT person_id FROM cohortpeople where cohort_id = %(cohort_id)s", {"cohort_id": cohort1.pk} + ) + self.assertEqual(len(results), 2) + + cohort2 = Cohort.objects.create( + team=self.team, + groups=[{"action_id": action.pk, "days": 3, "count": 1, "count_operator": "lte"}], + name="cohort2", + ) + with freeze_time("2020-01-10"): + cohort2.calculate_people_ch() + + results = sync_execute( + "SELECT person_id FROM cohortpeople where cohort_id = %(cohort_id)s", {"cohort_id": cohort2.pk} + ) + self.assertEqual(len(results), 1) + + cohort3 = Cohort.objects.create( + team=self.team, + groups=[{"action_id": action.pk, "days": 3, "count": 1, "count_operator": "eq"}], + name="cohort3", + ) + with freeze_time("2020-01-10"): + cohort3.calculate_people_ch() + + results = sync_execute( + "SELECT person_id FROM cohortpeople where cohort_id = %(cohort_id)s", {"cohort_id": cohort3.pk} + ) + self.assertEqual(len(results), 1) + + def test_cohortpeople_deleted_person(self): + p1 = Person.objects.create( + team_id=self.team.pk, + distinct_ids=["1"], + properties={"$some_prop": "something", "$another_prop": "something"}, + ) + p2 = Person.objects.create( + team_id=self.team.pk, + distinct_ids=["2"], + properties={"$some_prop": "something", "$another_prop": "something"}, + ) + + cohort1 = Cohort.objects.create( + team=self.team, + groups=[{"properties": {"$some_prop": "something", "$another_prop": "something"}}], + name="cohort1", + ) + + cohort1.calculate_people_ch() + p2.delete() + cohort1.calculate_people_ch() + + def test_cohortpeople_prop_changed(self): + with freeze_time("2020-01-10"): + p1 = Person.objects.create( + team_id=self.team.pk, + distinct_ids=["1"], + properties={"$some_prop": "something", "$another_prop": "something"}, + ) + p2 = Person.objects.create( + team_id=self.team.pk, + distinct_ids=["2"], + properties={"$some_prop": "something", "$another_prop": "something"}, + ) + + cohort1 = Cohort.objects.create( + team=self.team, + groups=[{"properties": {"$some_prop": "something", "$another_prop": "something"}}], + name="cohort1", + ) + + cohort1.calculate_people_ch() + + with freeze_time("2020-01-11"): + p2.properties = {"$some_prop": "another", "$another_prop": "another"} + p2.save() + + cohort1.calculate_people_ch() + + results = sync_execute( + "SELECT person_id FROM cohortpeople WHERE 
team_id = %(team_id)s GROUP BY person_id, team_id, cohort_id HAVING sum(sign) > 0", + {"team_id": self.team.pk}, + ) + + self.assertEqual(len(results), 1) + self.assertEqual(results[0][0], p1.uuid) + + def test_cohort_change(self): + with freeze_time("2020-01-10"): + p1 = Person.objects.create( + team_id=self.team.pk, + distinct_ids=["1"], + properties={"$some_prop": "something", "$another_prop": "something"}, + ) + p2 = Person.objects.create( + team_id=self.team.pk, + distinct_ids=["2"], + properties={"$some_prop": "another", "$another_prop": "another"}, + ) + + cohort1 = Cohort.objects.create( + team=self.team, + groups=[{"properties": {"$some_prop": "something", "$another_prop": "something"}}], + name="cohort1", + ) + cohort1.calculate_people_ch() + + results = sync_execute( + "SELECT person_id FROM cohortpeople WHERE team_id = %(team_id)s GROUP BY person_id, team_id, cohort_id HAVING sum(sign) > 0", + {"team_id": self.team.pk}, + ) + + self.assertEqual(len(results), 1) + self.assertEqual(results[0][0], p1.uuid) + + with freeze_time("2020-01-11"): + cohort1.groups = [{"properties": {"$some_prop": "another", "$another_prop": "another"}}] + cohort1.save() + cohort1.calculate_people_ch() + + results = sync_execute( + "SELECT person_id FROM cohortpeople WHERE team_id = %(team_id)s GROUP BY person_id, team_id, cohort_id HAVING sum(sign) > 0", + {"team_id": self.team.pk}, + ) + + self.assertEqual(len(results), 1) + self.assertEqual(results[0][0], p2.uuid) + + def test_static_cohort_precalculated(self): + Person.objects.create(team_id=self.team.pk, distinct_ids=["1"]) + Person.objects.create(team_id=self.team.pk, distinct_ids=["123"]) + Person.objects.create(team_id=self.team.pk, distinct_ids=["2"]) + # Team leakage + team2 = Team.objects.create(organization=self.organization) + Person.objects.create(team=team2, distinct_ids=["1"]) + + cohort = Cohort.objects.create(team=self.team, groups=[], is_static=True, last_calculation=timezone.now(),) + cohort.insert_users_by_list(["1", "123"]) + + with freeze_time("2020-01-10"): + cohort.calculate_people_ch() + + with self.settings(USE_PRECALCULATED_CH_COHORT_PEOPLE=True): + sql, _ = format_filter_query(cohort) + self.assertEqual( + sqlparse.format(sql, reindent=True), + sqlparse.format( + """ + SELECT distinct_id + FROM + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = %(team_id)s + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) + WHERE person_id IN + (SELECT person_id + FROM person_static_cohort + WHERE cohort_id = %(_cohort_id_0)s + AND team_id = %(team_id)s) + """, + reindent=True, + ), + ) + + def test_cohortpeople_with_valid_other_cohort_filter(self): + p1 = Person.objects.create(team_id=self.team.pk, distinct_ids=["1"], properties={"foo": "bar"},) + p2 = Person.objects.create(team_id=self.team.pk, distinct_ids=["2"], properties={"foo": "non"},) + + cohort0: Cohort = Cohort.objects.create( + team=self.team, groups=[{"properties": {"foo": "bar"}}], name="cohort0", + ) + cohort0.calculate_people_ch() + + cohort1: Cohort = Cohort.objects.create( + team=self.team, + groups=[{"properties": [{"key": "id", "type": "cohort", "value": cohort0.id}]}], + name="cohort1", + ) + + cohort1.calculate_people_ch() + + count_result = sync_execute( + "SELECT count(person_id) FROM cohortpeople where cohort_id = %(cohort_id)s", {"cohort_id": cohort1.pk} + )[0][0] + 
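The HAVING sum(sign) > 0 clauses above suggest cohortpeople stores signed rows, so that an addition and a later removal cancel out and current membership must be read through an aggregate rather than a plain SELECT. A minimal sketch, assuming that interpretation and using a hypothetical helper name:

    def _current_cohortpeople(team_id: int, cohort_id: int):
        # Only (person_id, cohort_id) groups whose signs still sum to a positive
        # value count as current members.
        return sync_execute(
            "SELECT person_id FROM cohortpeople "
            "WHERE team_id = %(team_id)s AND cohort_id = %(cohort_id)s "
            "GROUP BY person_id, team_id, cohort_id "
            "HAVING sum(sign) > 0",
            {"team_id": team_id, "cohort_id": cohort_id},
        )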
self.assertEqual(count_result, 1) + + def test_cohortpeople_with_nonexistent_other_cohort_filter(self): + p1 = Person.objects.create(team_id=self.team.pk, distinct_ids=["1"], properties={"foo": "bar"},) + p2 = Person.objects.create(team_id=self.team.pk, distinct_ids=["2"], properties={"foo": "non"},) + + cohort1: Cohort = Cohort.objects.create( + team=self.team, groups=[{"properties": [{"key": "id", "type": "cohort", "value": 666}]}], name="cohort1", + ) + + cohort1.calculate_people_ch() + + count_result = sync_execute( + "SELECT count(person_id) FROM cohortpeople where cohort_id = %(cohort_id)s", {"cohort_id": cohort1.pk} + )[0][0] + self.assertEqual(count_result, 0) + + def test_cohortpeople_with_cyclic_cohort_filter(self): + p1 = Person.objects.create(team_id=self.team.pk, distinct_ids=["1"], properties={"foo": "bar"},) + p2 = Person.objects.create(team_id=self.team.pk, distinct_ids=["2"], properties={"foo": "non"},) + + cohort1: Cohort = Cohort.objects.create( + team=self.team, groups=[], name="cohort1", + ) + cohort1.groups = [{"properties": [{"key": "id", "type": "cohort", "value": cohort1.id}]}] + cohort1.save() + + cohort1.calculate_people_ch() + + count_result = sync_execute( + "SELECT count(person_id) FROM cohortpeople where cohort_id = %(cohort_id)s", {"cohort_id": cohort1.pk} + )[0][0] + self.assertEqual(count_result, 2) + + def test_clickhouse_empty_query(self): + cohort2 = Cohort.objects.create( + team=self.team, groups=[{"properties": {"$some_prop": "nomatchihope"}}], name="cohort1", + ) + + cohort2.calculate_people() + self.assertFalse(Cohort.objects.get().is_calculating) diff --git a/ee/clickhouse/models/test/test_dead_letter_queue.py b/ee/clickhouse/models/test/test_dead_letter_queue.py new file mode 100644 index 0000000000000..257fbf1f11fec --- /dev/null +++ b/ee/clickhouse/models/test/test_dead_letter_queue.py @@ -0,0 +1,118 @@ +import json +from datetime import datetime +from uuid import UUID, uuid4 + +from kafka import KafkaProducer + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.models.test.utils.util import delay_until_clickhouse_consumes_from_kafka +from ee.clickhouse.sql.dead_letter_queue import DEAD_LETTER_QUEUE_TABLE, INSERT_DEAD_LETTER_QUEUE_EVENT_SQL +from ee.clickhouse.util import ClickhouseTestMixin +from ee.kafka_client.topics import KAFKA_DEAD_LETTER_QUEUE +from posthog.settings import KAFKA_HOSTS +from posthog.test.base import BaseTest + +TEST_EVENT_RAW_PAYLOAD = json.dumps( + {"event": "some event", "properties": {"distinct_id": 2, "token": "invalid token",},} +) + +CREATED_AT = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f") +ERROR_TIMESTAMP = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f") +NOW = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f") + + +TEST_DATA = { + "id": str(uuid4()), + "event_uuid": str(uuid4()), + "event": "some event", + "properties": "{ a: 1 }", + "distinct_id": "some distinct id", + "team_id": 1, + "elements_chain": "", + "created_at": CREATED_AT, + "ip": "127.0.0.1", + "site_url": "https://myawesomewebsite.com", + "now": NOW, + "raw_payload": TEST_EVENT_RAW_PAYLOAD, + "error_timestamp": ERROR_TIMESTAMP, + "error_location": "plugin-server", + "error": "createPerson failed", +} + + +def reset_tables(): + sync_execute("TRUNCATE TABLE events_dead_letter_queue") + + # can't truncate table with kafka engine, reading from it will delete the rows + sync_execute("SELECT * FROM kafka_events_dead_letter_queue") + + +class TestDeadLetterQueue(ClickhouseTestMixin, BaseTest): + def setUp(self): + super().setUp() + 
reset_tables() + + def test_direct_table_insert(self): + + sync_execute( + INSERT_DEAD_LETTER_QUEUE_EVENT_SQL, TEST_DATA, + ) + + dead_letter_queue_events = sync_execute("SELECT * FROM events_dead_letter_queue LIMIT 1") + + dlq_event = dead_letter_queue_events[0] + + self.assertEqual(type(dlq_event[0]), UUID) # id + self.assertEqual(type(dlq_event[1]), UUID) # event_uuid + self.assertEqual(dlq_event[2], "some event") # event + self.assertEqual(dlq_event[3], "{ a: 1 }") # properties + self.assertEqual(dlq_event[4], "some distinct id") # distinct_id + self.assertEqual(dlq_event[5], 1) # team_id + self.assertEqual(dlq_event[6], "") # elements_chain + self.assertEqual(dlq_event[7].strftime("%Y-%m-%d %H:%M:%S.%f"), CREATED_AT) # created_at + self.assertEqual(dlq_event[8], "127.0.0.1") # ip + self.assertEqual(dlq_event[9], "https://myawesomewebsite.com") # site_url + self.assertEqual(dlq_event[10].strftime("%Y-%m-%d %H:%M:%S.%f"), NOW) # now + self.assertEqual(dlq_event[11], TEST_EVENT_RAW_PAYLOAD) # raw_payload + self.assertEqual(dlq_event[12].strftime("%Y-%m-%d %H:%M:%S.%f"), ERROR_TIMESTAMP) # created_at + self.assertEqual(dlq_event[13], "plugin-server") # error_location + self.assertEqual(dlq_event[14], "createPerson failed") # error + + def test_kafka_insert(self): + + kafka_data = TEST_DATA + + new_id = str(uuid4()) + kafka_data["id"] = new_id + + new_event_uuid = str(uuid4()) + kafka_data["event_uuid"] = new_event_uuid + + new_error = "cannot reach db to fetch team" + kafka_data["error"] = new_error + + kafka_producer = KafkaProducer(bootstrap_servers=KAFKA_HOSTS) + + kafka_producer.send(topic=KAFKA_DEAD_LETTER_QUEUE, value=json.dumps(kafka_data).encode("utf-8")) + + delay_until_clickhouse_consumes_from_kafka(DEAD_LETTER_QUEUE_TABLE, 1) + + dead_letter_queue_events = sync_execute(f"SELECT * FROM {DEAD_LETTER_QUEUE_TABLE} LIMIT 1") + + dlq_event = dead_letter_queue_events[0] + + self.assertEqual(str(dlq_event[0]), new_id) # id + self.assertEqual(str(dlq_event[1]), new_event_uuid) # event_uuid + self.assertEqual(dlq_event[2], "some event") # event + self.assertEqual(dlq_event[3], "{ a: 1 }") # properties + self.assertEqual(dlq_event[4], "some distinct id") # distinct_id + self.assertEqual(dlq_event[5], 1) # team_id + self.assertEqual(dlq_event[6], "") # elements_chain + self.assertEqual(dlq_event[7].strftime("%Y-%m-%d %H:%M:%S.%f"), CREATED_AT) # created_at + self.assertEqual(dlq_event[8], "127.0.0.1") # ip + self.assertEqual(dlq_event[9], "https://myawesomewebsite.com") # site_url + self.assertEqual(dlq_event[10].strftime("%Y-%m-%d %H:%M:%S.%f"), NOW) # now + self.assertEqual(dlq_event[11], TEST_EVENT_RAW_PAYLOAD) # raw_payload + self.assertEqual(dlq_event[12].strftime("%Y-%m-%d %H:%M:%S.%f"), ERROR_TIMESTAMP) # created_at + self.assertEqual(dlq_event[13], "plugin-server") # error_location + self.assertEqual(dlq_event[14], new_error) # error diff --git a/ee/clickhouse/models/test/test_element.py b/ee/clickhouse/models/test/test_element.py index 835171415878d..4aba8f34a9acc 100644 --- a/ee/clickhouse/models/test/test_element.py +++ b/ee/clickhouse/models/test/test_element.py @@ -1,9 +1,9 @@ from ee.clickhouse.client import sync_execute from ee.clickhouse.models.element import chain_to_elements, elements_to_string from ee.clickhouse.util import ClickhouseTestMixin -from posthog.api.test.base import BaseTest from posthog.models import Element from posthog.models.utils import UUIDT +from posthog.test.base import BaseTest class TestClickhouseElement(ClickhouseTestMixin, BaseTest): diff --git 
a/ee/clickhouse/models/test/test_filters.py b/ee/clickhouse/models/test/test_filters.py index a7634b49c4c9a..696e866d28c8f 100644 --- a/ee/clickhouse/models/test/test_filters.py +++ b/ee/clickhouse/models/test/test_filters.py @@ -3,16 +3,17 @@ from ee.clickhouse.client import sync_execute from ee.clickhouse.models.event import ClickhouseEventSerializer, create_event -from ee.clickhouse.models.person import ClickhousePersonSerializer from ee.clickhouse.models.property import parse_prop_clauses from ee.clickhouse.sql.events import GET_EVENTS_WITH_PROPERTIES from ee.clickhouse.util import ClickhouseTestMixin from posthog.models.cohort import Cohort from posthog.models.event import Event -from posthog.models.filter import Filter +from posthog.models.filters import Filter +from posthog.models.filters.retention_filter import RetentionFilter +from posthog.models.filters.test.test_filter import TestFilter as PGTestFilters +from posthog.models.filters.test.test_filter import property_to_Q_test_factory from posthog.models.person import Person from posthog.models.team import Team -from posthog.test.test_filter_model import property_to_Q_test_factory def _filter_events( @@ -46,7 +47,161 @@ def _create_event(**kwargs): return Event(id=str(uuid)) -class TestClickhouseFiltering( +class TestFilters(PGTestFilters): + def test_simplify_cohorts(self): + cohort = Cohort.objects.create( + team=self.team, + groups=[{"properties": [{"key": "email", "operator": "icontains", "value": ".com", "type": "person"}]}], + ) + cohort.calculate_people_ch() + + filter = Filter(data={"properties": [{"type": "cohort", "key": "id", "value": cohort.pk}]}) + + self.assertEqual( + filter.simplify(self.team).properties_to_dict(), + {"properties": [{"key": "email", "operator": "icontains", "value": ".com", "type": "person"},]}, + ) + + with self.settings(USE_PRECALCULATED_CH_COHORT_PEOPLE=True): + self.assertEqual( + filter.simplify(self.team).properties_to_dict(), + {"properties": [{"type": "precalculated-cohort", "key": "id", "value": cohort.pk, "operator": None},]}, + ) + + def test_simplify_not_ee(self): + cohort = Cohort.objects.create( + team=self.team, + groups=[{"properties": [{"key": "email", "operator": "icontains", "value": ".com", "type": "person"}]}], + ) + filter = Filter(data={"properties": [{"type": "cohort", "key": "id", "value": cohort.pk}]}) + + self.assertEqual( + filter.simplify(self.team, is_clickhouse_enabled=False).properties_to_dict(), + {"properties": [{"type": "cohort", "key": "id", "value": cohort.pk, "operator": None}]}, + ) + + def test_simplify_static_cohort(self): + cohort = Cohort.objects.create(team=self.team, groups=[], is_static=True) + filter = Filter(data={"properties": [{"type": "cohort", "key": "id", "value": cohort.pk}]}) + + self.assertEqual( + filter.simplify(self.team).properties_to_dict(), + {"properties": [{"type": "static-cohort", "key": "id", "value": cohort.pk, "operator": None},]}, + ) + + def test_simplify_hasdone_cohort(self): + cohort = Cohort.objects.create(team=self.team, groups=[{"event_id": "$pageview", "days": 1}]) + filter = Filter(data={"properties": [{"type": "cohort", "key": "id", "value": cohort.pk}]}) + + self.assertEqual( + filter.simplify(self.team).properties_to_dict(), + {"properties": [{"type": "cohort", "key": "id", "value": cohort.pk, "operator": None}]}, + ) + + def test_simplify_multi_group_cohort(self): + cohort = Cohort.objects.create( + team=self.team, + groups=[{"properties": {"$some_prop": "something"}}, {"properties": {"$another_prop": "something"}}], + 
) + filter = Filter(data={"properties": [{"type": "cohort", "key": "id", "value": cohort.pk}]}) + + self.assertEqual( + filter.simplify(self.team).properties_to_dict(), + {"properties": [{"type": "cohort", "key": "id", "value": cohort.pk, "operator": None}]}, + ) + + def test_recursive_cohort(self): + cohort = Cohort.objects.create( + team=self.team, + groups=[{"properties": [{"key": "email", "operator": "icontains", "value": ".com", "type": "person"}]}], + ) + recursive_cohort = Cohort.objects.create( + team=self.team, + groups=[{"properties": [{"type": "cohort", "key": "id", "value": cohort.pk, "operator": None}]}], + ) + filter = Filter(data={"properties": [{"type": "cohort", "key": "id", "value": recursive_cohort.pk}]}) + + self.assertEqual( + filter.simplify(self.team).properties_to_dict(), + {"properties": [{"key": "email", "operator": "icontains", "value": ".com", "type": "person"},]}, + ) + + def test_simplify_no_such_cohort(self): + filter = Filter(data={"properties": [{"type": "cohort", "key": "id", "value": 555_555}]}) + + self.assertEqual( + filter.simplify(self.team).properties_to_dict(), + {"properties": [{"type": "cohort", "key": "id", "value": 555_555, "operator": None}]}, + ) + + def test_simplify_entities(self): + cohort = Cohort.objects.create( + team=self.team, + groups=[{"properties": [{"key": "email", "operator": "icontains", "value": ".com", "type": "person"}]}], + ) + filter = Filter( + data={"events": [{"id": "$pageview", "properties": [{"type": "cohort", "key": "id", "value": cohort.pk}]}]} + ) + + self.assertEqual( + filter.simplify(self.team).entities_to_dict(), + { + "events": [ + { + "type": "events", + "id": "$pageview", + "math": None, + "math_property": None, + "math_group_type_index": None, + "custom_name": None, + "order": None, + "name": "$pageview", + "properties": [{"key": "email", "operator": "icontains", "value": ".com", "type": "person"},], + } + ], + }, + ) + + def test_simplify_entities_with_group_math(self): + filter = Filter(data={"events": [{"id": "$pageview", "math": "unique_group", "math_group_type_index": 2}]}) + + self.assertEqual( + filter.simplify(self.team).entities_to_dict(), + { + "events": [ + { + "type": "events", + "id": "$pageview", + "math": "unique_group", + "math_property": None, + "math_group_type_index": 2, + "custom_name": None, + "order": None, + "name": "$pageview", + "properties": [{"key": "$group_2", "operator": "is_not", "value": "", "type": "event"},], + } + ], + }, + ) + + def test_simplify_when_aggregating_by_group(self): + filter = RetentionFilter(data={"aggregation_group_type_index": 0}) + + self.assertEqual( + filter.simplify(self.team).properties_to_dict(), + {"properties": [{"key": "$group_0", "operator": "is_not", "value": "", "type": "event"}]}, + ) + + def test_simplify_funnel_entities_when_aggregating_by_group(self): + filter = Filter(data={"events": [{"id": "$pageview"}], "aggregation_group_type_index": 2}) + + self.assertEqual( + filter.simplify(self.team).properties_to_dict(), + {"properties": [{"key": "$group_2", "operator": "is_not", "value": "", "type": "event"}]}, + ) + + +class TestFiltering( ClickhouseTestMixin, property_to_Q_test_factory(_filter_events, _create_event, _create_person), # type: ignore ): def test_person_cohort_properties(self): @@ -56,7 +211,9 @@ def test_person_cohort_properties(self): ) cohort1 = Cohort.objects.create( - team=self.team, groups=[{"properties": {"$some_prop": "something"}}], name="cohort1" + team=self.team, + groups=[{"properties": [{"type": "person", "key": 
"$some_prop", "value": "something"}]}], + name="cohort1", ) person2_distinct_id = "person2" @@ -64,30 +221,38 @@ def test_person_cohort_properties(self): team=self.team, distinct_ids=[person2_distinct_id], properties={"$some_prop": "different"} ) cohort2 = Cohort.objects.create( - team=self.team, groups=[{"properties": {"$some_prop__is_not": "something"}}], name="cohort2" + team=self.team, + groups=[ + {"properties": [{"type": "person", "key": "$some_prop", "value": "something", "operator": "is_not"}]} + ], + name="cohort2", ) - filter = Filter(data={"properties": [{"key": "id", "value": cohort1.pk, "type": "cohort"}],}) + filter = Filter(data={"properties": [{"key": "id", "value": cohort1.pk, "type": "cohort"}],}, team=self.team) - prop_clause, prop_clause_params = parse_prop_clauses(filter.properties, self.team.pk) + prop_clause, prop_clause_params = parse_prop_clauses( + filter.properties, self.team.pk, has_person_id_joined=False + ) query = """ - SELECT * FROM person_distinct_id WHERE team_id = %(team_id)s {prop_clause} + SELECT distinct_id FROM person_distinct_id WHERE team_id = %(team_id)s {prop_clause} """.format( prop_clause=prop_clause ) # get distinct_id column of result - result = sync_execute(query, {"team_id": self.team.pk, **prop_clause_params})[0][1] + result = sync_execute(query, {"team_id": self.team.pk, **prop_clause_params})[0][0] self.assertEqual(result, person1_distinct_id) # test cohort2 with negation - filter = Filter(data={"properties": [{"key": "id", "value": cohort2.pk, "type": "cohort"}],}) - prop_clause, prop_clause_params = parse_prop_clauses(filter.properties, self.team.pk) + filter = Filter(data={"properties": [{"key": "id", "value": cohort2.pk, "type": "cohort"}],}, team=self.team) + prop_clause, prop_clause_params = parse_prop_clauses( + filter.properties, self.team.pk, has_person_id_joined=False + ) query = """ - SELECT * FROM person_distinct_id WHERE team_id = %(team_id)s {prop_clause} + SELECT distinct_id FROM person_distinct_id WHERE team_id = %(team_id)s {prop_clause} """.format( prop_clause=prop_clause ) # get distinct_id column of result - result = sync_execute(query, {"team_id": self.team.pk, **prop_clause_params})[0][1] + result = sync_execute(query, {"team_id": self.team.pk, **prop_clause_params})[0][0] self.assertEqual(result, person2_distinct_id) diff --git a/ee/clickhouse/models/test/test_plugin_log_entry.py b/ee/clickhouse/models/test/test_plugin_log_entry.py new file mode 100644 index 0000000000000..4c00d67df0903 --- /dev/null +++ b/ee/clickhouse/models/test/test_plugin_log_entry.py @@ -0,0 +1,37 @@ +from django.utils import timezone + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.sql.plugin_log_entries import INSERT_PLUGIN_LOG_ENTRY_SQL +from posthog.models import PluginLogEntry +from posthog.models.utils import UUIDT +from posthog.test.test_plugin_log_entry import factory_test_plugin_log_entry + + +def plugin_log_factory_ch( + *, + team_id: int, + plugin_id: int, + plugin_config_id: int, + source: PluginLogEntry.Source, + type: PluginLogEntry.Type, + message: str, + instance_id: str +): + sync_execute( + INSERT_PLUGIN_LOG_ENTRY_SQL, + { + "id": UUIDT(), + "team_id": team_id, + "plugin_id": plugin_id, + "plugin_config_id": plugin_config_id, + "source": source, + "type": type, + "instance_id": instance_id, + "message": message, + "timestamp": timezone.now().strftime("%Y-%m-%dT%H:%M:%S.%f"), + }, + ) + + +class TestEvent(factory_test_plugin_log_entry(plugin_log_factory_ch)): # type: ignore + pass diff --git 
a/ee/clickhouse/models/test/test_property.py b/ee/clickhouse/models/test/test_property.py index 0ca31cc217510..59b86e89dff66 100644 --- a/ee/clickhouse/models/test/test_property.py +++ b/ee/clickhouse/models/test/test_property.py @@ -1,22 +1,28 @@ -from uuid import uuid4 +from typing import List +from uuid import UUID, uuid4 + +import pytest from ee.clickhouse.client import sync_execute +from ee.clickhouse.materialized_columns.columns import materialize from ee.clickhouse.models.event import create_event -from ee.clickhouse.models.property import parse_prop_clauses +from ee.clickhouse.models.property import parse_prop_clauses, prop_filter_json_extract +from ee.clickhouse.models.util import PersonPropertiesMode +from ee.clickhouse.queries.person_query import ClickhousePersonQuery +from ee.clickhouse.sql.person import GET_TEAM_PERSON_DISTINCT_IDS from ee.clickhouse.util import ClickhouseTestMixin -from posthog.api.test.base import BaseTest -from posthog.models.cohort import Cohort -from posthog.models.event import Event -from posthog.models.filter import Filter +from posthog.models.element import Element +from posthog.models.filters import Filter from posthog.models.person import Person -from posthog.models.team import Team +from posthog.models.property import Property +from posthog.test.base import BaseTest -def _create_event(**kwargs) -> Event: +def _create_event(**kwargs) -> UUID: pk = uuid4() kwargs.update({"event_uuid": pk}) create_event(**kwargs) - return Event(pk=str(pk)) + return pk def _create_person(**kwargs) -> Person: @@ -25,6 +31,13 @@ def _create_person(**kwargs) -> Person: class TestPropFormat(ClickhouseTestMixin, BaseTest): + CLASS_DATA_LEVEL_SETUP = False + + def _run_query(self, filter: Filter) -> List: + query, params = parse_prop_clauses(filter.properties, self.team.pk, allow_denormalized_props=True) + final_query = "SELECT uuid FROM events WHERE team_id = %(team_id)s {}".format(query) + return sync_execute(final_query, {**params, "team_id": self.team.pk}) + def test_prop_person(self): _create_person( @@ -38,14 +51,9 @@ def test_prop_person(self): ) filter = Filter(data={"properties": [{"key": "email", "value": "test@posthog.com", "type": "person"}],}) - query, params = parse_prop_clauses(filter.properties, self.team.pk) - - final_query = "SELECT uuid FROM events WHERE team_id = %(team_id)s {}".format(query) - result = sync_execute(final_query, {**params, "team_id": self.team.pk}) - self.assertEqual(len(result), 1) + self.assertEqual(len(self._run_query(filter)), 1) def test_prop_event(self): - _create_event( event="$pageview", team=self.team, distinct_id="whatever", properties={"attr": "some_other_val"}, ) @@ -54,9 +62,448 @@ def test_prop_event(self): event="$pageview", team=self.team, distinct_id="whatever", properties={"attr": "some_val"}, ) - filter = Filter(data={"properties": [{"key": "attr", "value": "some_val"}],}) - query, params = parse_prop_clauses(filter.properties, self.team.pk) - final_query = "SELECT uuid FROM events WHERE team_id = %(team_id)s {}".format(query) + filter_exact = Filter(data={"properties": [{"key": "attr", "value": "some_val"}],}) + self.assertEqual(len(self._run_query(filter_exact)), 1) + + filter_regex = Filter(data={"properties": [{"key": "attr", "value": "some_.+_val", "operator": "regex"}],}) + self.assertEqual(len(self._run_query(filter_regex)), 1) + + filter_icontains = Filter(data={"properties": [{"key": "attr", "value": "Some_Val", "operator": "icontains"}],}) + self.assertEqual(len(self._run_query(filter_icontains)), 1) + + 
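# icontains / not_icontains match case-insensitively, which is why "Some_Val" above still finds the event whose attr is "some_val".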
filter_not_icontains = Filter( + data={"properties": [{"key": "attr", "value": "other", "operator": "not_icontains"}],} + ) + self.assertEqual(len(self._run_query(filter_not_icontains)), 1) + + def test_prop_element(self): + _create_event( + event="$autocapture", + team=self.team, + distinct_id="whatever", + properties={"attr": "some_other_val"}, + elements=[ + Element(tag_name="a", href="/a-url", attr_class=["small"], text="bla bla", nth_child=1, nth_of_type=0,), + Element(tag_name="button", attr_class=["btn", "btn-primary"], nth_child=0, nth_of_type=0), + Element(tag_name="div", nth_child=0, nth_of_type=0), + Element(tag_name="label", nth_child=0, nth_of_type=0, attr_id="nested",), + ], + ) + _create_event( + event="$autocapture", + team=self.team, + distinct_id="whatever", + properties={"attr": "some_val"}, + elements=[ + Element( + tag_name="a", + href="/a-url", + attr_class=["small"], + text='bla"bla', + attributes={}, + nth_child=1, + nth_of_type=0, + ), + Element(tag_name="button", attr_class=["btn", "btn-secondary"], nth_child=0, nth_of_type=0), + Element(tag_name="div", nth_child=0, nth_of_type=0), + Element(tag_name="img", nth_child=0, nth_of_type=0, attr_id="nested",), + ], + ) + _create_event( + event="$autocapture", + team=self.team, + distinct_id="whatever", + elements=[ + Element(tag_name="a", href="/789", nth_child=0, nth_of_type=0,), + Element(tag_name="button", attr_class=["btn", "btn-tertiary"], nth_child=0, nth_of_type=0), + ], + ) + + # selector + + filter = Filter( + data={"properties": [{"key": "selector", "value": [".btn"], "operator": "exact", "type": "element"}]} + ) + self.assertEqual(len(self._run_query(filter)), 3) + + filter = Filter( + data={"properties": [{"key": "selector", "value": ".btn", "operator": "exact", "type": "element"}]} + ) + self.assertEqual(len(self._run_query(filter)), 3) + + filter = Filter( + data={ + "properties": [{"key": "selector", "value": [".btn-primary"], "operator": "exact", "type": "element"}] + } + ) + self.assertEqual(len(self._run_query(filter)), 1) + + filter = Filter( + data={ + "properties": [{"key": "selector", "value": [".btn-secondary"], "operator": "exact", "type": "element"}] + } + ) + self.assertEqual(len(self._run_query(filter)), 1) + + filter = Filter( + data={ + "properties": [ + { + "key": "selector", + "value": [".btn-primary", ".btn-secondary"], + "operator": "exact", + "type": "element", + } + ] + } + ) + self.assertEqual(len(self._run_query(filter)), 2) + + filter_selector_exact_empty = Filter( + data={"properties": [{"key": "selector", "value": [], "operator": "exact", "type": "element",}]} + ) + self.assertEqual(len(self._run_query(filter_selector_exact_empty)), 0) + + filter_selector_is_not_empty = Filter( + data={"properties": [{"key": "selector", "value": [], "operator": "is_not", "type": "element",}]} + ) + self.assertEqual(len(self._run_query(filter_selector_is_not_empty)), 3) + + # tag_name + + filter = Filter( + data={"properties": [{"key": "tag_name", "value": ["div"], "operator": "exact", "type": "element"}]} + ) + self.assertEqual(len(self._run_query(filter)), 2) + + filter = Filter( + data={"properties": [{"key": "tag_name", "value": "div", "operator": "exact", "type": "element"}]} + ) + self.assertEqual(len(self._run_query(filter)), 2) + + filter = Filter( + data={"properties": [{"key": "tag_name", "value": ["img"], "operator": "exact", "type": "element"}]} + ) + self.assertEqual(len(self._run_query(filter)), 1) + + filter = Filter( + data={"properties": [{"key": "tag_name", "value": ["label"], 
"operator": "exact", "type": "element"}]} + ) + self.assertEqual(len(self._run_query(filter)), 1) + + filter = Filter( + data={ + "properties": [{"key": "tag_name", "value": ["img", "label"], "operator": "exact", "type": "element"}] + } + ) + self.assertEqual(len(self._run_query(filter)), 2) + + # href/text + + filter_href_exact = Filter( + data={"properties": [{"key": "href", "value": ["/a-url"], "operator": "exact", "type": "element"}]} + ) + self.assertEqual(len(self._run_query(filter_href_exact)), 2) + + filter_href_exact_double = Filter( + data={"properties": [{"key": "href", "value": ["/a-url", "/789"], "operator": "exact", "type": "element"}]} + ) + self.assertEqual(len(self._run_query(filter_href_exact_double)), 3) + + filter_href_exact_empty = Filter( + data={"properties": [{"key": "href", "value": [], "operator": "exact", "type": "element"}]} + ) + self.assertEqual(len(self._run_query(filter_href_exact_empty)), 0) + + filter_href_is_not = Filter( + data={"properties": [{"key": "href", "value": ["/a-url"], "operator": "is_not", "type": "element"}]} + ) + self.assertEqual(len(self._run_query(filter_href_is_not)), 1) + + filter_href_is_not_double = Filter( + data={"properties": [{"key": "href", "value": ["/a-url", "/789"], "operator": "is_not", "type": "element"}]} + ) + self.assertEqual(len(self._run_query(filter_href_is_not_double)), 0) + + filter_href_is_not_empty = Filter( + data={"properties": [{"key": "href", "value": [], "operator": "is_not", "type": "element"}]} + ) + self.assertEqual(len(self._run_query(filter_href_is_not_empty)), 3) + + filter_href_exact_with_tag_name_is_not = Filter( + data={ + "properties": [ + {"key": "href", "value": ["/a-url"], "type": "element"}, + {"key": "tag_name", "value": ["marquee"], "operator": "is_not", "type": "element"}, + ] + } + ) + self.assertEqual(len(self._run_query(filter_href_exact_with_tag_name_is_not)), 2) + + filter_href_icontains = Filter( + data={"properties": [{"key": "href", "value": ["UrL"], "operator": "icontains", "type": "element"}]} + ) + self.assertEqual(len(self._run_query(filter_href_icontains)), 2) + + filter_href_regex = Filter( + data={"properties": [{"key": "href", "value": "/a-.+", "operator": "regex", "type": "element"}]} + ) + self.assertEqual(len(self._run_query(filter_href_regex)), 2) + + filter_href_not_regex = Filter( + data={"properties": [{"key": "href", "value": r"/\d+", "operator": "not_regex", "type": "element"}]} + ) + self.assertEqual(len(self._run_query(filter_href_not_regex)), 2) + + filter_text_icontains_with_doublequote = Filter( + data={"properties": [{"key": "text", "value": 'bla"bla', "operator": "icontains", "type": "element"}]} + ) + self.assertEqual(len(self._run_query(filter_text_icontains_with_doublequote)), 1) + + filter_text_is_set = Filter( + data={"properties": [{"key": "text", "value": "is_set", "operator": "is_set", "type": "element"}]} + ) + self.assertEqual(len(self._run_query(filter_text_is_set)), 2) + + filter_text_is_not_set = Filter( + data={"properties": [{"key": "text", "value": "is_not_set", "operator": "is_not_set", "type": "element"}]} + ) + self.assertEqual(len(self._run_query(filter_text_is_not_set)), 1) + + def test_prop_ints_saved_as_strings(self): + _create_event( + event="$pageview", team=self.team, distinct_id="whatever", properties={"test_prop": "0"}, + ) + _create_event( + event="$pageview", team=self.team, distinct_id="whatever", properties={"test_prop": "2"}, + ) + _create_event( + event="$pageview", team=self.team, distinct_id="whatever", 
properties={"test_prop": 2}, + ) + _create_event( + event="$pageview", team=self.team, distinct_id="whatever", properties={"test_prop": "string"}, + ) + filter = Filter(data={"properties": [{"key": "test_prop", "value": "2"}],}) + self.assertEqual(len(self._run_query(filter)), 2) + + filter = Filter(data={"properties": [{"key": "test_prop", "value": 2}],}) + self.assertEqual(len(self._run_query(filter)), 2) + + # value passed as string + filter = Filter(data={"properties": [{"key": "test_prop", "value": "1", "operator": "gt"}],}) + self.assertEqual(len(self._run_query(filter)), 2) + filter = Filter(data={"properties": [{"key": "test_prop", "value": "3", "operator": "lt"}],}) + self.assertEqual(len(self._run_query(filter)), 3) + + # value passed as int + filter = Filter(data={"properties": [{"key": "test_prop", "value": 1, "operator": "gt"}],}) + self.assertEqual(len(self._run_query(filter)), 2) + + filter = Filter(data={"properties": [{"key": "test_prop", "value": 3, "operator": "lt"}],}) + self.assertEqual(len(self._run_query(filter)), 3) + + def test_prop_decimals(self): + _create_event( + event="$pageview", team=self.team, distinct_id="whatever", properties={"test_prop": 1.4}, + ) + _create_event( + event="$pageview", team=self.team, distinct_id="whatever", properties={"test_prop": 1.3}, + ) + _create_event( + event="$pageview", team=self.team, distinct_id="whatever", properties={"test_prop": 2}, + ) + _create_event( + event="$pageview", team=self.team, distinct_id="whatever", properties={"test_prop": 2.5}, + ) + + filter = Filter(data={"properties": [{"key": "test_prop", "value": 1.5}],}) + self.assertEqual(len(self._run_query(filter)), 0) + + filter = Filter(data={"properties": [{"key": "test_prop", "value": 1.2, "operator": "gt"}],}) + self.assertEqual(len(self._run_query(filter)), 4) + + filter = Filter(data={"properties": [{"key": "test_prop", "value": "1.2", "operator": "gt"}],}) + self.assertEqual(len(self._run_query(filter)), 4) + + filter = Filter(data={"properties": [{"key": "test_prop", "value": 2.3, "operator": "lt"}],}) + self.assertEqual(len(self._run_query(filter)), 3) + + +class TestPropDenormalized(ClickhouseTestMixin, BaseTest): + CLASS_DATA_LEVEL_SETUP = False + + def _run_query(self, filter: Filter, join_person_tables=False) -> List: + query, params = parse_prop_clauses( + filter.properties, + self.team.pk, + allow_denormalized_props=True, + person_properties_mode=PersonPropertiesMode.EXCLUDE, + ) + joins = "" + if join_person_tables: + person_query = ClickhousePersonQuery(filter, self.team.pk) + person_subquery, person_join_params = person_query.get_query() + joins = f""" + INNER JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN ({person_subquery}) person ON pdi.person_id = person.id + """ + params.update(person_join_params) + + final_query = f"SELECT uuid FROM events {joins} WHERE team_id = %(team_id)s {query}" + # Make sure we don't accidentally use json on the properties field + self.assertNotIn("json", final_query.lower()) + return sync_execute(final_query, {**params, "team_id": self.team.pk}) + + def test_prop_event_denormalized(self): + _create_event( + event="$pageview", team=self.team, distinct_id="whatever", properties={"test_prop": "some_other_val"}, + ) + + _create_event( + event="$pageview", team=self.team, distinct_id="whatever", properties={"test_prop": "some_val"}, + ) + + materialize("events", "test_prop") + materialize("events", "something_else") + + filter = Filter(data={"properties": [{"key": 
"test_prop", "value": "some_val"}],}) + self.assertEqual(len(self._run_query(filter)), 1) + + filter = Filter(data={"properties": [{"key": "test_prop", "value": "some_val", "operator": "is_not"}],}) + self.assertEqual(len(self._run_query(filter)), 1) + + filter = Filter(data={"properties": [{"key": "test_prop", "value": "some_val", "operator": "is_set"}],}) + self.assertEqual(len(self._run_query(filter)), 2) + + filter = Filter(data={"properties": [{"key": "test_prop", "value": "some_val", "operator": "is_not_set"}],}) + self.assertEqual(len(self._run_query(filter)), 0) + + filter = Filter(data={"properties": [{"key": "test_prop", "value": "_other_", "operator": "icontains"}],}) + self.assertEqual(len(self._run_query(filter)), 1) + + filter = Filter(data={"properties": [{"key": "test_prop", "value": "_other_", "operator": "not_icontains"}],}) + self.assertEqual(len(self._run_query(filter)), 1) + + def test_prop_person_denormalized(self): + _create_person(distinct_ids=["some_id"], team_id=self.team.pk, properties={"email": "test@posthog.com"}) + _create_event(event="$pageview", team=self.team, distinct_id="some_id") + + materialize("person", "email") + + filter = Filter( + data={"properties": [{"key": "email", "type": "person", "value": "posthog", "operator": "icontains"}],} + ) + self.assertEqual(len(self._run_query(filter, join_person_tables=True)), 1) + + filter = Filter( + data={"properties": [{"key": "email", "type": "person", "value": "posthog", "operator": "not_icontains"}],} + ) + self.assertEqual(len(self._run_query(filter, join_person_tables=True)), 0) + + def test_prop_event_denormalized_ints(self): + _create_event( + event="$pageview", team=self.team, distinct_id="whatever", properties={"test_prop": 0}, + ) + + _create_event( + event="$pageview", team=self.team, distinct_id="whatever", properties={"test_prop": 2}, + ) + + materialize("events", "test_prop") + materialize("events", "something_else") + + filter = Filter(data={"properties": [{"key": "test_prop", "value": 1, "operator": "gt"}],}) + self.assertEqual(len(self._run_query(filter)), 1) + + filter = Filter(data={"properties": [{"key": "test_prop", "value": 1, "operator": "lt"}],}) + self.assertEqual(len(self._run_query(filter)), 1) + + filter = Filter(data={"properties": [{"key": "test_prop", "value": 0}],}) + self.assertEqual(len(self._run_query(filter)), 1) + + +def test_parse_prop_clauses_defaults(snapshot): + filter = Filter( + data={ + "properties": [ + {"key": "event_prop", "value": "value"}, + {"key": "email", "type": "person", "value": "posthog", "operator": "icontains"}, + ], + } + ) + + assert parse_prop_clauses(filter.properties, None) == snapshot + assert ( + parse_prop_clauses( + filter.properties, None, person_properties_mode=PersonPropertiesMode.USING_PERSON_PROPERTIES_COLUMN + ) + == snapshot + ) + assert parse_prop_clauses(filter.properties, None, person_properties_mode=PersonPropertiesMode.EXCLUDE) == snapshot + + +@pytest.fixture +def test_events(db, team) -> List[UUID]: + return [ + _create_event(event="$pageview", team=team, distinct_id="whatever", properties={"email": "test@posthog.com"},), + _create_event(event="$pageview", team=team, distinct_id="whatever", properties={"email": "mongo@example.com"},), + _create_event(event="$pageview", team=team, distinct_id="whatever", properties={"attr": "some_val"},), + _create_event(event="$pageview", team=team, distinct_id="whatever", properties={"attr": "50"},), + _create_event(event="$pageview", team=team, distinct_id="whatever", properties={"attr": 5},), + ] 
+ + +TEST_PROPERTIES = [ + (Property(key="email", value="test@posthog.com"), [0]), + (Property(key="email", value="test@posthog.com", operator="exact"), [0]), + (Property(key="email", value=["pineapple@pizza.com", "mongo@example.com"], operator="exact"), [1]), + (Property(key="attr", value="5"), [4]), + (Property(key="email", value="test@posthog.com", operator="is_not"), range(1, 5)), + (Property(key="email", value=["test@posthog.com", "mongo@example.com"], operator="is_not"), range(2, 5)), + (Property(key="email", value=r".*est@.*", operator="regex"), [0]), + (Property(key="email", value=r"?.", operator="regex"), []), + (Property(key="email", operator="is_set", value="is_set"), [0, 1]), + (Property(key="email", operator="is_not_set", value="is_not_set"), range(2, 5)), +] + + +@pytest.mark.parametrize("property,expected_event_indexes", TEST_PROPERTIES) +def test_prop_filter_json_extract(test_events, property, expected_event_indexes, team): + query, params = prop_filter_json_extract(property, 0, allow_denormalized_props=False) + uuids = list( + sorted( + [ + uuid + for (uuid,) in sync_execute( + f"SELECT uuid FROM events WHERE team_id = %(team_id)s {query}", {"team_id": team.pk, **params} + ) + ] + ) + ) + expected = list(sorted([test_events[index] for index in expected_event_indexes])) + + assert uuids == expected + + +@pytest.mark.parametrize("property,expected_event_indexes", TEST_PROPERTIES) +def test_prop_filter_json_extract_materialized(test_events, property, expected_event_indexes, team): + materialize("events", "attr") + materialize("events", "email") + + query, params = prop_filter_json_extract(property, 0, allow_denormalized_props=True) + + assert "JSONExtract" not in query + + uuids = list( + sorted( + [ + uuid + for (uuid,) in sync_execute( + f"SELECT uuid FROM events WHERE team_id = %(team_id)s {query}", {"team_id": team.pk, **params} + ) + ] + ) + ) + expected = list(sorted([test_events[index] for index in expected_event_indexes])) - result = sync_execute(final_query, {**params, "team_id": self.team.pk}) - self.assertEqual(len(result), 1) + assert uuids == expected diff --git a/ee/clickhouse/models/test/utils/util.py b/ee/clickhouse/models/test/utils/util.py new file mode 100644 index 0000000000000..a40055c809484 --- /dev/null +++ b/ee/clickhouse/models/test/utils/util.py @@ -0,0 +1,14 @@ +from time import sleep, time + +from ee.clickhouse.client import sync_execute + + +# this normally is unnecessary as CH is fast to consume from Kafka when testing +# but it helps prevent potential flakiness +def delay_until_clickhouse_consumes_from_kafka(table_name: str, target_row_count: int, timeout_seconds=10) -> None: + ts_start = time() + while time() < ts_start + timeout_seconds: + result = sync_execute(f"SELECT COUNT(1) FROM {table_name}") + if result[0][0] == target_row_count: + return + sleep(0.5) diff --git a/ee/clickhouse/models/util.py b/ee/clickhouse/models/util.py index 795489cf7a20b..8158610656eaf 100644 --- a/ee/clickhouse/models/util.py +++ b/ee/clickhouse/models/util.py @@ -1,50 +1,17 @@ import json +from enum import Enum, auto from typing import Optional, Union import pytz from dateutil.parser import isoparse from django.utils import timezone -from posthog.models.property import Property - -def get_operator(prop: Property, arg: str): - operator = prop.operator - - if operator == "is_not": - return "(trim(BOTH '\"' FROM ep.value) = %({})s)".format(arg), prop.value - elif operator == "icontains" or operator == "not_icontains": - value = "%{}%".format(prop.value) - return 
"(trim(BOTH '\"' FROM ep.value) LIKE %({})s)".format(arg), value - elif operator == "regex" or operator == "not_regex": - return "match(trim(BOTH '\"' FROM ep.value), %({})s)".format(arg), prop.value - elif operator == "is_set": - return "", prop.value - elif operator == "is_not_set": - return "", prop.value - elif operator == "gt": - return ( - "(toInt64(trim(BOTH '\"' FROM ep.value)) > %({})s)".format(arg), - prop.value, - ) - elif operator == "lt": - return ( - "(toInt64(trim(BOTH '\"' FROM ep.value)) < %({})s)".format(arg), - prop.value, - ) - else: - if is_json(prop.value): - return ( - "replaceRegexpAll(trim(BOTH '\"' FROM ep.value),' ', '') = replaceRegexpAll(toString(%({})s),' ', '')".format( - arg - ), - prop.value, - ) - else: - return ( - "(trim(BOTH '\"' FROM ep.value) = toString(%({})s))".format(arg), - prop.value, - ) +class PersonPropertiesMode(Enum): + USING_SUBQUERY = auto() + USING_PERSON_PROPERTIES_COLUMN = auto() + # Used when person join handles these filters + EXCLUDE = auto() def is_json(val): @@ -52,19 +19,15 @@ def is_json(val): return False try: - json.loads(val) - except ValueError: + int(val) return False - return True - - -def is_int(value: Optional[Union[str, int]]) -> bool: + except: + pass try: - int(value) # type: ignore + json.loads(val) except (ValueError, TypeError): return False - else: - return True + return True def cast_timestamp_or_now(timestamp: Optional[Union[timezone.datetime, str]]) -> str: diff --git a/ee/clickhouse/process_event.py b/ee/clickhouse/process_event.py deleted file mode 100644 index 710ef97936e26..0000000000000 --- a/ee/clickhouse/process_event.py +++ /dev/null @@ -1,187 +0,0 @@ -import datetime -import json -from typing import Dict, Optional -from uuid import UUID - -import statsd -from celery import shared_task -from dateutil import parser -from dateutil.relativedelta import relativedelta -from django.conf import settings -from django.db.utils import IntegrityError -from sentry_sdk import capture_exception - -from ee.clickhouse.models.event import create_event -from ee.clickhouse.models.session_recording_event import create_session_recording_event -from ee.kafka_client.client import KafkaProducer -from ee.kafka_client.topics import KAFKA_EVENTS_WAL -from posthog.ee import is_ee_enabled -from posthog.models.element import Element -from posthog.models.person import Person -from posthog.models.team import Team -from posthog.models.utils import UUIDT -from posthog.tasks.process_event import handle_identify_or_alias, store_names_and_properties - -if settings.STATSD_HOST is not None: - statsd.Connection.set_defaults(host=settings.STATSD_HOST, port=settings.STATSD_PORT) - - -def _capture_ee( - event_uuid: UUID, - person_uuid: UUID, - ip: str, - site_url: str, - team_id: int, - event: str, - distinct_id: str, - properties: Dict, - timestamp: datetime.datetime, -) -> None: - elements = properties.get("$elements") - elements_list = [] - if elements: - del properties["$elements"] - elements_list = [ - Element( - text=el["$el_text"][0:400] if el.get("$el_text") else None, - tag_name=el["tag_name"], - href=el["attr__href"][0:2048] if el.get("attr__href") else None, - attr_class=el["attr__class"].split(" ") if el.get("attr__class") else None, - attr_id=el.get("attr__id"), - nth_child=el.get("nth_child"), - nth_of_type=el.get("nth_of_type"), - attributes={key: value for key, value in el.items() if key.startswith("attr__")}, - ) - for index, el in enumerate(elements) - ] - - team = Team.objects.only("slack_incoming_webhook", "event_names", 
"event_properties", "anonymize_ips").get( - pk=team_id - ) - - if not team.anonymize_ips and "$ip" not in properties: - properties["$ip"] = ip - - store_names_and_properties(team=team, event=event, properties=properties) - - if not Person.objects.distinct_ids_exist(team_id=team_id, distinct_ids=[str(distinct_id)]): - # Catch race condition where in between getting and creating, - # another request already created this user - try: - Person.objects.create(team_id=team_id, distinct_ids=[str(distinct_id)]) - except IntegrityError: - pass - - # # determine create events - create_event( - event_uuid=event_uuid, - event=event, - properties=properties, - timestamp=timestamp, - team=team, - distinct_id=distinct_id, - elements=elements_list, - site_url=site_url, - ) - - -def handle_timestamp(data: dict, now: datetime.datetime, sent_at: Optional[datetime.datetime]) -> datetime.datetime: - if data.get("timestamp"): - if sent_at: - # sent_at - timestamp == now - x - # x = now + (timestamp - sent_at) - try: - # timestamp and sent_at must both be in the same format: either both with or both without timezones - # otherwise we can't get a diff to add to now - return now + (parser.isoparse(data["timestamp"]) - sent_at) - except TypeError as e: - capture_exception(e) - return parser.isoparse(data["timestamp"]) - now_datetime = now - if data.get("offset"): - return now_datetime - relativedelta(microseconds=data["offset"] * 1000) - return now_datetime - - -if is_ee_enabled(): - - def process_event_ee( - distinct_id: str, - ip: str, - site_url: str, - data: dict, - team_id: int, - now: datetime.datetime, - sent_at: Optional[datetime.datetime], - ) -> None: - timer = statsd.Timer("%s_posthog_cloud" % (settings.STATSD_PREFIX,)) - timer.start() - properties = data.get("properties", {}) - if data.get("$set"): - properties["$set"] = data["$set"] - - person_uuid = UUIDT() - event_uuid = UUIDT() - ts = handle_timestamp(data, now, sent_at) - handle_identify_or_alias(data["event"], properties, distinct_id, team_id) - - if data["event"] == "$snapshot": - create_session_recording_event( - uuid=event_uuid, - team_id=team_id, - distinct_id=distinct_id, - session_id=properties["$session_id"], - snapshot_data=properties["$snapshot_data"], - timestamp=ts, - ) - return - - _capture_ee( - event_uuid=event_uuid, - person_uuid=person_uuid, - ip=ip, - site_url=site_url, - team_id=team_id, - event=data["event"], - distinct_id=distinct_id, - properties=properties, - timestamp=ts, - ) - timer.stop("process_event_ee") - - -else: - - def process_event_ee( - distinct_id: str, - ip: str, - site_url: str, - data: dict, - team_id: int, - now: datetime.datetime, - sent_at: Optional[datetime.datetime], - ) -> None: - # Noop if ee is not enabled - return - - -def log_event( - distinct_id: str, - ip: str, - site_url: str, - data: dict, - team_id: int, - now: datetime.datetime, - sent_at: Optional[datetime.datetime], -) -> None: - data = { - "distinct_id": distinct_id, - "ip": ip, - "site_url": site_url, - "data": json.dumps(data), - "team_id": team_id, - "now": now.isoformat(), - "sent_at": sent_at.isoformat() if sent_at else "", - } - p = KafkaProducer() - p.produce(topic=KAFKA_EVENTS_WAL, data=data) diff --git a/ee/clickhouse/queries/__init__.py b/ee/clickhouse/queries/__init__.py index c2a24810f8116..4815ed5c49231 100644 --- a/ee/clickhouse/queries/__init__.py +++ b/ee/clickhouse/queries/__init__.py @@ -1,2 +1,7 @@ -from .clickhouse_funnel import ClickhouseFunnel +from ee.clickhouse.queries.funnels.funnel import ClickhouseFunnel + +from 
.clickhouse_retention import ClickhouseRetention +from .clickhouse_session_recording import SessionRecording +from .clickhouse_stickiness import ClickhouseStickiness +from .paths import ClickhousePaths from .trends.clickhouse_trends import ClickhouseTrends diff --git a/ee/clickhouse/queries/breakdown_props.py b/ee/clickhouse/queries/breakdown_props.py new file mode 100644 index 0000000000000..109a52e9cd021 --- /dev/null +++ b/ee/clickhouse/queries/breakdown_props.py @@ -0,0 +1,153 @@ +from typing import Any, Dict, List, Optional, Tuple, cast + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.models.cohort import format_filter_query +from ee.clickhouse.models.entity import get_entity_filtering_params +from ee.clickhouse.models.property import get_property_string_expr, parse_prop_clauses +from ee.clickhouse.models.util import PersonPropertiesMode +from ee.clickhouse.queries.column_optimizer import ColumnOptimizer +from ee.clickhouse.queries.groups_join_query import GroupsJoinQuery +from ee.clickhouse.queries.person_query import ClickhousePersonQuery +from ee.clickhouse.queries.util import parse_timestamps +from ee.clickhouse.sql.person import GET_TEAM_PERSON_DISTINCT_IDS +from ee.clickhouse.sql.trends.top_elements import TOP_ELEMENTS_ARRAY_OF_KEY_SQL +from posthog.models.cohort import Cohort +from posthog.models.entity import Entity +from posthog.models.filters.filter import Filter +from posthog.models.property import TableWithProperties + +ALL_USERS_COHORT_ID = 0 + + +def get_breakdown_prop_values( + filter: Filter, + entity: Entity, + aggregate_operation: str, + team_id: int, + limit: int = 25, + extra_params={}, + column_optimizer: Optional[ColumnOptimizer] = None, +): + "Returns the top N breakdown prop values for event/person breakdown" + + parsed_date_from, parsed_date_to, date_params = parse_timestamps(filter=filter, team_id=team_id) + prop_filters, prop_filter_params = parse_prop_clauses( + filter.properties + entity.properties, + team_id, + table_name="e", + prepend="e_brkdwn", + person_properties_mode=PersonPropertiesMode.EXCLUDE, + allow_denormalized_props=True, + ) + + entity_params, entity_format_params = get_entity_filtering_params(entity, team_id, table_name="e") + + if filter.breakdown_type == "person": + value_expression, _ = get_property_string_expr("person", cast(str, filter.breakdown), "%(key)s", "person_props") + elif filter.breakdown_type == "group": + value_expression, _ = get_property_string_expr( + "groups", cast(str, filter.breakdown), "%(key)s", f"group_properties_{filter.breakdown_group_type_index}" + ) + else: + value_expression, _ = get_property_string_expr("events", cast(str, filter.breakdown), "%(key)s", "properties") + + person_join_clauses = "" + person_join_params: Dict = {} + person_query = ClickhousePersonQuery(filter, team_id, column_optimizer=column_optimizer, entity=entity) + if person_query.is_used: + person_subquery, person_join_params = person_query.get_query() + person_join_clauses = f""" + INNER JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) AS pdi ON e.distinct_id = pdi.distinct_id + INNER JOIN ({person_subquery}) person ON pdi.person_id = person.id + """ + + groups_join_condition, groups_join_params = GroupsJoinQuery(filter, team_id, column_optimizer).get_join_query() + + elements_query = TOP_ELEMENTS_ARRAY_OF_KEY_SQL.format( + value_expression=value_expression, + parsed_date_from=parsed_date_from, + parsed_date_to=parsed_date_to, + prop_filters=prop_filters, + aggregate_operation=aggregate_operation, + 
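# person_join_clauses stays an empty string unless ClickhousePersonQuery.is_used indicates the person table is actually needed (see above); the groups join is built analogously by GroupsJoinQuery.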
person_join_clauses=person_join_clauses, + groups_join_clauses=groups_join_condition, + **entity_format_params, + ) + + return sync_execute( + elements_query, + { + "key": filter.breakdown, + "limit": limit, + "team_id": team_id, + "offset": filter.offset, + **prop_filter_params, + **entity_params, + **person_join_params, + **groups_join_params, + **extra_params, + **date_params, + }, + )[0][0] + + +def _format_all_query(team_id: int, filter: Filter, **kwargs) -> Tuple[str, Dict]: + entity = kwargs.pop("entity", None) + parsed_date_from, parsed_date_to, date_params = parse_timestamps( + filter=filter, team_id=team_id, table="all_events." + ) + + props_to_filter = [*filter.properties] + + if entity and isinstance(entity, Entity): + props_to_filter = [*props_to_filter, *entity.properties] + + prop_filters, prop_filter_params = parse_prop_clauses( + props_to_filter, team_id, prepend="all_cohort_", table_name="all_events" + ) + query = f""" + SELECT DISTINCT distinct_id, {ALL_USERS_COHORT_ID} as value + FROM events all_events + WHERE team_id = {team_id} + {parsed_date_from} + {parsed_date_to} + {prop_filters} + """ + return query, {**date_params, **prop_filter_params} + + +def format_breakdown_cohort_join_query(team_id: int, filter: Filter, **kwargs) -> Tuple[str, List, Dict]: + entity = kwargs.pop("entity", None) + cohorts = ( + Cohort.objects.filter(team_id=team_id, pk__in=[b for b in filter.breakdown if b != "all"]) + if isinstance(filter.breakdown, list) + else Cohort.objects.filter(team_id=team_id, pk=filter.breakdown) + ) + cohort_queries, params = _parse_breakdown_cohorts(list(cohorts)) + ids = [cohort.pk for cohort in cohorts] + if isinstance(filter.breakdown, list) and "all" in filter.breakdown: + all_query, all_params = _format_all_query(team_id, filter, entity=entity) + cohort_queries.append(all_query) + params = {**params, **all_params} + ids.append(ALL_USERS_COHORT_ID) + return " UNION ALL ".join(cohort_queries), ids, params + + +def _parse_breakdown_cohorts(cohorts: List[Cohort]) -> Tuple[List[str], Dict]: + queries = [] + params: Dict[str, Any] = {} + for idx, cohort in enumerate(cohorts): + person_id_query, cohort_filter_params = format_filter_query(cohort, idx) + params = {**params, **cohort_filter_params} + cohort_query = person_id_query.replace( + "SELECT distinct_id", f"SELECT distinct_id, {cohort.pk} as value", 1 + ) # only replace the first top level occurrence + queries.append(cohort_query) + return queries, params + + +def get_breakdown_cohort_name(cohort_id: int) -> str: + if cohort_id == ALL_USERS_COHORT_ID: + return "all users" + else: + return Cohort.objects.get(pk=cohort_id).name diff --git a/ee/clickhouse/queries/clickhouse_funnel.py b/ee/clickhouse/queries/clickhouse_funnel.py deleted file mode 100644 index ab501b5bce92e..0000000000000 --- a/ee/clickhouse/queries/clickhouse_funnel.py +++ /dev/null @@ -1,95 +0,0 @@ -import uuid -from collections import defaultdict -from typing import Any, Dict, List, Match, Tuple - -from django.utils import timezone - -from ee.clickhouse.client import sync_execute -from ee.clickhouse.models.action import format_action_filter -from ee.clickhouse.models.property import parse_prop_clauses -from ee.clickhouse.queries.util import parse_timestamps -from ee.clickhouse.sql.funnels.funnel import FUNNEL_SQL -from posthog.constants import TREND_FILTER_TYPE_ACTIONS -from posthog.models.action import Action -from posthog.models.entity import Entity -from posthog.models.filter import Filter -from posthog.models.person import Person -from 
posthog.models.team import Team -from posthog.queries.funnel import Funnel -from posthog.utils import relative_date_parse - - -class ClickhouseFunnel(Funnel): - _filter: Filter - _team: Team - - def __init__(self, filter: Filter, team: Team) -> None: - self._filter = filter - self._team = team - - def _build_filters(self, entity: Entity, index: int) -> str: - prop_filters, prop_filter_params = parse_prop_clauses(entity.properties, self._team.pk, prepend=str(index)) - self.params.update(prop_filter_params) - if entity.properties: - return prop_filters - return "" - - def _build_steps_query(self, entity: Entity, index: int) -> str: - filters = self._build_filters(entity, index) - if entity.type == TREND_FILTER_TYPE_ACTIONS: - action = Action.objects.get(pk=entity.id) - for action_step in action.steps.all(): - self.params["events"].append(action_step.event) - action_query, action_params = format_action_filter(action, "step_{}".format(index)) - if action_query == "": - return "" - - self.params.update(action_params) - content_sql = "{actions_query} {filters}".format(actions_query=action_query, filters=filters,) - else: - self.params["events"].append(entity.id) - content_sql = "event = '{event}' {filters}".format(event=entity.id, filters=filters) - return content_sql - - def _exec_query(self) -> List[Tuple]: - prop_filters, prop_filter_params = parse_prop_clauses(self._filter.properties, self._team.pk, prepend="global") - - # format default dates - if not self._filter._date_from: - self._filter._date_from = relative_date_parse("-7d") - if not self._filter._date_to: - self._filter._date_to = timezone.now() - - parsed_date_from, parsed_date_to = parse_timestamps(filter=self._filter, table="events.") - self.params: Dict = { - "team_id": self._team.pk, - "events": [], # purely a speed optimization, don't need this for filtering - **prop_filter_params, - } - steps = [self._build_steps_query(entity, index) for index, entity in enumerate(self._filter.entities)] - query = FUNNEL_SQL.format( - team_id=self._team.id, - steps=", ".join(steps), - filters=prop_filters.replace("uuid IN", "events.uuid IN", 1), - parsed_date_from=parsed_date_from, - parsed_date_to=parsed_date_to, - ) - return sync_execute(query, self.params) - - def run(self, *args, **kwargs) -> List[Dict[str, Any]]: - # Format of this is [step order, person count (that reached that step), array of person uuids] - results = self._exec_query() - - steps = [] - relevant_people = [] - total_people = 0 - - for step in reversed(self._filter.entities): - # Clickhouse step order starts at one, hence the +1 - result_step = [x for x in results if step.order + 1 == x[0]] # type: ignore - if len(result_step) > 0: - total_people += result_step[0][1] - relevant_people += result_step[0][2] - steps.append(self._serialize_step(step, total_people, relevant_people[0:100])) - - return steps[::-1] #  reverse diff --git a/ee/clickhouse/queries/clickhouse_paths.py b/ee/clickhouse/queries/clickhouse_paths.py deleted file mode 100644 index aa849a15ac1be..0000000000000 --- a/ee/clickhouse/queries/clickhouse_paths.py +++ /dev/null @@ -1,98 +0,0 @@ -from typing import Dict, List, Optional - -from django.utils import timezone - -from ee.clickhouse.client import sync_execute -from ee.clickhouse.models.property import parse_prop_clauses -from ee.clickhouse.queries.util import parse_timestamps -from ee.clickhouse.sql.events import EXTRACT_TAG_REGEX, EXTRACT_TEXT_REGEX -from ee.clickhouse.sql.paths.path import PATHS_QUERY_FINAL -from posthog.constants import 
AUTOCAPTURE_EVENT, CUSTOM_EVENT, SCREEN_EVENT -from posthog.models.filter import Filter -from posthog.models.team import Team -from posthog.queries.paths import Paths -from posthog.utils import relative_date_parse - - -class ClickhousePaths(Paths): - def _determine_path_type(self, requested_type=None): - # Default - event: Optional[str] = "$pageview" - path_type = "JSONExtractString(properties, '$current_url')" - start_comparator = "path_type" - - # determine requested type - if requested_type: - if requested_type == SCREEN_EVENT: - event = SCREEN_EVENT - path_type = "JSONExtractString(properties, '$screen_name')" - elif requested_type == AUTOCAPTURE_EVENT: - event = AUTOCAPTURE_EVENT - path_type = "concat('<', {tag_regex}, '> ', {text_regex})".format( - tag_regex=EXTRACT_TAG_REGEX, text_regex=EXTRACT_TEXT_REGEX - ) - start_comparator = "elements_chain" - elif requested_type == CUSTOM_EVENT: - event = None - path_type = "event" - return event, path_type, start_comparator - - def calculate_paths(self, filter: Filter, team: Team): - - # format default dates - if not filter._date_from: - filter._date_from = relative_date_parse("-7d") - if not filter._date_to: - filter._date_to = timezone.now() - - parsed_date_from, parsed_date_to = parse_timestamps(filter=filter) - event, path_type, start_comparator = self._determine_path_type(filter.path_type if filter else None) - - prop_filters, prop_filter_params = parse_prop_clauses(filter.properties, team.pk) - - # Step 0. Event culling subexpression for step 1. - # Make an expression that removes events in a session that are definitely unused. - # For example the 4th, 5th, etc row after a "new_session = 1" or "marked_session_start = 1" row gets removed - excess_row_filter = "(" - for i in range(4): - if i > 0: - excess_row_filter += " or " - excess_row_filter += "neighbor(new_session, {}, 0) = 1".format(-i) - if filter and filter.start_point: - excess_row_filter += " or neighbor(marked_session_start, {}, 0) = 1".format(-i) - excess_row_filter += ")" - - paths_query = PATHS_QUERY_FINAL.format( - event_query="event = %(event)s" - if event - else "event NOT IN ('$autocapture', '$pageview', '$identify', '$pageleave', '$screen')", - path_type=path_type, - parsed_date_from=parsed_date_from, - parsed_date_to=parsed_date_to, - filters=prop_filters, - marked_session_start="{} = %(start_point)s".format(start_comparator) - if filter and filter.start_point - else "new_session", - excess_row_filter=excess_row_filter, - select_elements_chain=", events.elements_chain as elements_chain" if event == AUTOCAPTURE_EVENT else "", - group_by_elements_chain=", events.elements_chain" if event == AUTOCAPTURE_EVENT else "", - ) - - params: Dict = { - "team_id": team.pk, - "property": "$current_url", - "event": event, - "start_point": filter.start_point, - } - params = {**params, **prop_filter_params} - - rows = sync_execute(paths_query, params) - - resp: List[Dict[str, str]] = [] - for row in rows: - resp.append( - {"source": row[0], "source_id": row[1], "target": row[2], "target_id": row[3], "value": row[4],} - ) - - resp = sorted(resp, key=lambda x: x["value"], reverse=True) - return resp diff --git a/ee/clickhouse/queries/clickhouse_retention.py b/ee/clickhouse/queries/clickhouse_retention.py index 5ec32dd361283..357435dc56df8 100644 --- a/ee/clickhouse/queries/clickhouse_retention.py +++ b/ee/clickhouse/queries/clickhouse_retention.py @@ -1,78 +1,208 @@ -import datetime -from typing import Any, Dict, Tuple +from typing import Any, Dict, Iterable, List, Tuple, cast + +from 
django.db.models.query import Prefetch from ee.clickhouse.client import sync_execute from ee.clickhouse.models.action import format_action_filter +from ee.clickhouse.models.person import get_persons_by_uuids from ee.clickhouse.models.property import parse_prop_clauses -from ee.clickhouse.sql.retention.retention import REFERENCE_EVENT_SQL, REFERENCE_EVENT_UNIQUE_SQL, RETENTION_SQL -from posthog.constants import TREND_FILTER_TYPE_ACTIONS, TREND_FILTER_TYPE_EVENTS +from ee.clickhouse.queries.retention.retention_event_query import RetentionEventsQuery +from ee.clickhouse.queries.util import get_trunc_func_ch +from ee.clickhouse.sql.person import GET_TEAM_PERSON_DISTINCT_IDS +from ee.clickhouse.sql.retention.people_in_period import ( + DEFAULT_REFERENCE_EVENT_PEOPLE_PER_PERIOD_SQL, + DEFAULT_REFERENCE_EVENT_UNIQUE_PEOPLE_PER_PERIOD_SQL, + REFERENCE_EVENT_PEOPLE_PER_PERIOD_SQL, + REFERENCE_EVENT_UNIQUE_PEOPLE_PER_PERIOD_SQL, + RETENTION_PEOPLE_PER_PERIOD_SQL, +) +from ee.clickhouse.sql.retention.retention import ( + INITIAL_INTERVAL_SQL, + REFERENCE_EVENT_SQL, + REFERENCE_EVENT_UNIQUE_SQL, + RETENTION_PEOPLE_SQL, + RETENTION_SQL, +) +from posthog.constants import ( + RETENTION_FIRST_TIME, + TREND_FILTER_TYPE_ACTIONS, + TREND_FILTER_TYPE_EVENTS, + TRENDS_LINEAR, + RetentionQueryType, +) from posthog.models.action import Action from posthog.models.entity import Entity -from posthog.models.filter import Filter +from posthog.models.filters import RetentionFilter +from posthog.models.person import Person from posthog.models.team import Team -from posthog.queries.retention import Retention - -PERIOD_TRUNC_HOUR = "toStartOfHour" -PERIOD_TRUNC_DAY = "toStartOfDay" -PERIOD_TRUNC_WEEK = "toStartOfWeek" -PERIOD_TRUNC_MONTH = "toStartOfMonth" +from posthog.queries.retention import AppearanceRow, Retention class ClickhouseRetention(Retention): - def _execute_sql( - self, - filter: Filter, - date_from: datetime.datetime, - date_to: datetime.datetime, - target_entity: Entity, - returning_entity: Entity, - is_first_time_retention: bool, - team: Team, - ) -> Dict[Tuple[int, int], Dict[str, Any]]: + def _execute_sql(self, filter: RetentionFilter, team: Team,) -> Dict[Tuple[int, int], Dict[str, Any]]: period = filter.period - prop_filters, prop_filter_params = parse_prop_clauses(filter.properties, team.pk) + is_first_time_retention = filter.retention_type == RETENTION_FIRST_TIME + date_from = filter.date_from + trunc_func = get_trunc_func_ch(period) + + returning_event_query, returning_event_params = RetentionEventsQuery( + filter=filter, team_id=team.pk, event_query_type=RetentionQueryType.RETURNING + ).get_query() + target_event_query, target_event_params = RetentionEventsQuery( + filter=filter, + team_id=team.pk, + event_query_type=RetentionQueryType.TARGET_FIRST_TIME + if is_first_time_retention + else RetentionQueryType.TARGET, + ).get_query() + + all_params = { + "team_id": team.pk, + "start_date": date_from.strftime( + "%Y-%m-%d{}".format(" %H:%M:%S" if filter.period == "Hour" else " 00:00:00") + ), + **returning_event_params, + **target_event_params, + "period": period, + } + + result = sync_execute( + RETENTION_SQL.format( + returning_event_query=returning_event_query, + trunc_func=trunc_func, + target_event_query=target_event_query, + ), + all_params, + ) + + initial_interval_result = sync_execute( + INITIAL_INTERVAL_SQL.format(reference_event_sql=target_event_query, trunc_func=trunc_func,), all_params + ) - target_query = "" - target_params: Dict = {} - trunc_func = self._get_trunc_func_ch(period) + 
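# Results are keyed by (cohort interval, return interval) tuples: INITIAL_INTERVAL_SQL provides the interval-0 totals, and RETENTION_SQL fills in the later return intervals.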
result_dict = {} + for initial_res in initial_interval_result: + result_dict.update({(initial_res[0], 0): {"count": initial_res[1], "people": []}}) + for res in result: + result_dict.update({(res[0], res[1]): {"count": res[2], "people": []}}) + + return result_dict + + def _get_condition(self, target_entity: Entity, table: str, prepend: str = "") -> Tuple[str, Dict]: if target_entity.type == TREND_FILTER_TYPE_ACTIONS: action = Action.objects.get(pk=target_entity.id) - action_query, target_params = format_action_filter(action, use_loop=True) - target_query = "AND e.uuid IN ({})".format(action_query) + action_query, params = format_action_filter(action, prepend=prepend, use_loop=False) + condition = action_query elif target_entity.type == TREND_FILTER_TYPE_EVENTS: - target_query = "AND e.event = %(target_event)s" - target_params = {"target_event": target_entity.id} + condition = "{}.event = %({}_event)s".format(table, prepend) + params = {"{}_event".format(prepend): target_entity.id} + else: + condition = "{}.event = %({}_event)s".format(table, prepend) + params = {"{}_event".format(prepend): "$pageview"} + return condition, params + + def _retrieve_people(self, filter: RetentionFilter, team: Team): + period = filter.period + is_first_time_retention = filter.retention_type == RETENTION_FIRST_TIME + trunc_func = get_trunc_func_ch(period) + prop_filters, prop_filter_params = parse_prop_clauses(filter.properties, team.pk) + + returning_entity = filter.returning_entity if filter.selected_interval > 0 else filter.target_entity + target_query, target_params = self._get_condition(filter.target_entity, table="e") + target_query_formatted = "AND {target_query}".format(target_query=target_query) + return_query, return_params = self._get_condition(returning_entity, table="e", prepend="returning") + return_query_formatted = "AND {return_query}".format(return_query=return_query) + + reference_event_query = (REFERENCE_EVENT_UNIQUE_SQL if is_first_time_retention else REFERENCE_EVENT_SQL).format( + target_query=target_query_formatted, + filters=prop_filters, + trunc_func=trunc_func, + GET_TEAM_PERSON_DISTINCT_IDS=GET_TEAM_PERSON_DISTINCT_IDS, + ) + reference_date_from = filter.date_from + reference_date_to = filter.date_from + filter.period_increment + date_from = filter.date_from + filter.selected_interval * filter.period_increment + date_to = date_from + filter.period_increment + + result = sync_execute( + RETENTION_PEOPLE_SQL.format( + reference_event_query=reference_event_query, + target_query=return_query_formatted, + filters=prop_filters, + GET_TEAM_PERSON_DISTINCT_IDS=GET_TEAM_PERSON_DISTINCT_IDS, + ), + { + "team_id": team.pk, + "start_date": date_from.strftime( + "%Y-%m-%d{}".format(" %H:%M:%S" if filter.period == "Hour" else " 00:00:00") + ), + "end_date": date_to.strftime( + "%Y-%m-%d{}".format(" %H:%M:%S" if filter.period == "Hour" else " 00:00:00") + ), + "reference_start_date": reference_date_from.strftime( + "%Y-%m-%d{}".format(" %H:%M:%S" if filter.period == "Hour" else " 00:00:00") + ), + "reference_end_date": reference_date_to.strftime( + "%Y-%m-%d{}".format(" %H:%M:%S" if filter.period == "Hour" else " 00:00:00") + ), + "offset": filter.offset, + **target_params, + **return_params, + **prop_filter_params, + }, + ) + people = Person.objects.filter(team_id=team.pk, uuid__in=[val[0] for val in result]) + + from posthog.api.person import PersonSerializer + + return PersonSerializer(people, many=True).data + + def _retrieve_people_in_period(self, filter: RetentionFilter, team: Team): + period 
= filter.period + is_first_time_retention = filter.retention_type == RETENTION_FIRST_TIME + trunc_func = get_trunc_func_ch(period) + prop_filters, prop_filter_params = parse_prop_clauses(filter.properties, team.pk) - target_query, target_params = self._get_condition(target_entity) - returning_query, returning_params = self._get_condition(returning_entity, "returning") + target_query, target_params = self._get_condition(filter.target_entity, table="e") + target_query_formatted = "AND {target_query}".format(target_query=target_query) + return_query, return_params = self._get_condition(filter.returning_entity, table="e", prepend="returning") + return_query_formatted = "AND {return_query}".format(return_query=return_query) - target_query_formatted = ( - "AND {target_query}".format(target_query=target_query) + first_event_sql = ( + REFERENCE_EVENT_UNIQUE_PEOPLE_PER_PERIOD_SQL if is_first_time_retention - else "AND ({target_query} OR {returning_query})".format( - target_query=target_query, returning_query=returning_query - ) + else REFERENCE_EVENT_PEOPLE_PER_PERIOD_SQL + ).format( + target_query=target_query_formatted, + filters=prop_filters, + trunc_func=trunc_func, + GET_TEAM_PERSON_DISTINCT_IDS=GET_TEAM_PERSON_DISTINCT_IDS, ) - returning_query_formatted = ( - "AND {returning_query}".format(returning_query=returning_query) + default_event_query = ( + DEFAULT_REFERENCE_EVENT_UNIQUE_PEOPLE_PER_PERIOD_SQL if is_first_time_retention - else "AND ({target_query} OR {returning_query})".format( - target_query=target_query, returning_query=returning_query - ) + else DEFAULT_REFERENCE_EVENT_PEOPLE_PER_PERIOD_SQL + ).format( + target_query=target_query_formatted, + filters=prop_filters, + trunc_func=trunc_func, + GET_TEAM_PERSON_DISTINCT_IDS=GET_TEAM_PERSON_DISTINCT_IDS, ) - reference_event_sql = (REFERENCE_EVENT_UNIQUE_SQL if is_first_time_retention else REFERENCE_EVENT_SQL).format( - target_query=target_query_formatted, filters=prop_filters, trunc_func=trunc_func, - ) - result = sync_execute( - RETENTION_SQL.format( - target_query=target_query_formatted, - returning_query=returning_query_formatted, + date_from = filter.date_from + filter.selected_interval * filter.period_increment + date_to = filter.date_to + + filter = filter.with_data({"total_intervals": filter.total_intervals - filter.selected_interval}) + + # NOTE: I'm using `Any` here to avoid typing issues when trying to iterate. 
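+        # The rows returned by the query below are consumed further down as
+        # AppearanceRow(person_id=row[0], appearance_count=row[1], appearances=row[2])
+        # and then joined with the serialized persons in process_people_in_period.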
+ query_result: Any = sync_execute( + RETENTION_PEOPLE_PER_PERIOD_SQL.format( + returning_query=return_query_formatted, filters=prop_filters, + first_event_sql=first_event_sql, + first_event_default_sql=default_event_query, trunc_func=trunc_func, - extra_union="UNION ALL {}".format(reference_event_sql) if is_first_time_retention else "", - reference_event_sql=reference_event_sql, + GET_TEAM_PERSON_DISTINCT_IDS=GET_TEAM_PERSON_DISTINCT_IDS, ), { "team_id": team.pk, @@ -82,41 +212,27 @@ def _execute_sql( "end_date": date_to.strftime( "%Y-%m-%d{}".format(" %H:%M:%S" if filter.period == "Hour" else " 00:00:00") ), - **prop_filter_params, - **target_params, - **returning_params, + "offset": filter.offset, + "limit": 100, "period": period, + **target_params, + **return_params, + **prop_filter_params, }, ) - result_dict = {} + people_appearances = [ + AppearanceRow(person_id=row[0], appearance_count=row[1], appearances=row[2]) for row in query_result + ] - for res in result: - result_dict.update({(res[0], res[1]): {"count": res[2], "people": []}}) + from posthog.api.person import PersonSerializer - return result_dict + people = get_persons_by_uuids(team_id=team.pk, uuids=[val[0] for val in query_result]) + people = people.prefetch_related(Prefetch("persondistinctid_set", to_attr="distinct_ids_cache")) - def _get_condition(self, target_entity: Entity, prepend: str = "") -> Tuple[str, Dict]: - if target_entity.type == TREND_FILTER_TYPE_ACTIONS: - action = Action.objects.get(pk=target_entity.id) - action_query, params = format_action_filter(action, prepend=prepend, use_loop=True) - condition = "e.uuid IN ({})".format(action_query) - elif target_entity.type == TREND_FILTER_TYPE_EVENTS: - condition = "e.event = %({}_event)s".format(prepend) - params = {"{}_event".format(prepend): target_entity.id} - else: - condition = "e.event = %({}_event)s".format(prepend) - params = {"{}_event".format(prepend): "$pageview"} - return condition, params + people_dict = {str(person.uuid): PersonSerializer(person).data for person in people} - def _get_trunc_func_ch(self, period: str) -> str: - if period == "Hour": - return PERIOD_TRUNC_HOUR - elif period == "Week": - return PERIOD_TRUNC_WEEK - elif period == "Day": - return PERIOD_TRUNC_DAY - elif period == "Month": - return PERIOD_TRUNC_MONTH - else: - raise ValueError(f"Period {period} is unsupported.") + result = self.process_people_in_period( + filter=filter, people_appearances=people_appearances, people_dict=people_dict + ) + return result diff --git a/ee/clickhouse/queries/clickhouse_session_recording.py b/ee/clickhouse/queries/clickhouse_session_recording.py index f501e90906e2c..f4fe2a8020f4e 100644 --- a/ee/clickhouse/queries/clickhouse_session_recording.py +++ b/ee/clickhouse/queries/clickhouse_session_recording.py @@ -1,55 +1,85 @@ import datetime import json -from typing import Any, Callable, List +from typing import Any, List from ee.clickhouse.client import sync_execute -from posthog.models import Team -from posthog.queries.base import BaseQuery -from posthog.queries.session_recording import SessionRecording as BaseSessionRecording -from posthog.queries.session_recording import add_session_recording_ids as _add_session_recording_ids +from posthog.models import SessionRecordingEvent, Team +from posthog.models.filters.sessions_filter import SessionsFilter +from posthog.queries.sessions.session_recording import SessionRecording as BaseSessionRecording +from posthog.queries.sessions.session_recording import join_with_session_recordings as 
_join_with_session_recordings +from posthog.queries.sessions.utils import cached_recording + +OPERATORS = {"gt": ">", "lt": "<"} SINGLE_RECORDING_QUERY = """ - SELECT snapshot_data + SELECT distinct_id, timestamp, snapshot_data FROM session_recording_events WHERE team_id = %(team_id)s AND session_id = %(session_id)s + ORDER BY timestamp """ -SESSIONS_RECORING_LIST_QUERY = """ +SESSIONS_IN_RANGE_QUERY = """ SELECT session_id, distinct_id, - MIN(timestamp) AS start_time, - MAX(timestamp) AS end_time - FROM session_recording_events - WHERE - team_id = %(team_id)s - AND timestamp >= %(start_time)s - AND timestamp <= %(end_time)s - GROUP BY distinct_id, session_id + start_time, + end_time, + dateDiff('second', toDateTime(start_time), toDateTime(end_time)) as duration + FROM ( + SELECT + session_id, + distinct_id, + MIN(timestamp) AS start_time, + MAX(timestamp) AS end_time, + COUNT((JSONExtractInt(snapshot_data, 'type') = 2 OR JSONExtractBool(snapshot_data, 'has_full_snapshot')) ? 1 : NULL) as full_snapshots + FROM session_recording_events + WHERE + team_id = %(team_id)s + AND timestamp >= %(start_time)s + AND timestamp <= %(end_time)s + GROUP BY distinct_id, session_id + ) + WHERE full_snapshots > 0 {filter_query} """ -SESSIONS_RECORING_LIST_QUERY_COLUMNS = ["session_id", "distinct_id", "start_time", "end_time"] +SESSIONS_IN_RANGE_QUERY_COLUMNS = ["session_id", "distinct_id", "start_time", "end_time", "duration"] class SessionRecording(BaseSessionRecording): - def query_recording_snapshots(self, team: Team, session_id: str) -> List[Any]: - response = sync_execute(SINGLE_RECORDING_QUERY, {"team_id": team.id, "session_id": session_id}) - return [json.loads(row[0]) for row in response] + def query_recording_snapshots(self) -> List[SessionRecordingEvent]: + response = sync_execute( + SINGLE_RECORDING_QUERY, {"team_id": self._team.id, "session_id": self._session_recording_id,}, + ) + return [ + SessionRecordingEvent(distinct_id=distinct_id, timestamp=timestamp, snapshot_data=json.loads(snapshot_data)) + for distinct_id, timestamp, snapshot_data in response + ] + + +def join_with_session_recordings(team: Team, sessions_results: List[Any], filter: SessionsFilter) -> List[Any]: + return _join_with_session_recordings(team, sessions_results, filter, query=query_sessions_in_range) -def add_session_recording_ids(team: Team, sessions_results: List[Any]) -> List[Any]: - return _add_session_recording_ids(team, sessions_results, query=query_sessions_in_range) +def query_sessions_in_range( + team: Team, start_time: datetime.datetime, end_time: datetime.datetime, filter: SessionsFilter +) -> List[dict]: + filter_query, filter_params = "", {} + if filter.recording_duration_filter: + filter_query = f"AND duration {OPERATORS[filter.recording_duration_filter.operator]} %(min_recording_duration)s" # type: ignore + filter_params = { + "min_recording_duration": filter.recording_duration_filter.value, + } -def query_sessions_in_range(team: Team, start_time: datetime.datetime, end_time: datetime.datetime) -> List[dict]: results = sync_execute( - SESSIONS_RECORING_LIST_QUERY, + SESSIONS_IN_RANGE_QUERY.format(filter_query=filter_query), { "team_id": team.id, "start_time": start_time.strftime("%Y-%m-%d %H:%M:%S.%f"), "end_time": end_time.strftime("%Y-%m-%d %H:%M:%S.%f"), + **filter_params, }, ) - return [dict(zip(SESSIONS_RECORING_LIST_QUERY_COLUMNS, row)) for row in results] + return [dict(zip(SESSIONS_IN_RANGE_QUERY_COLUMNS, row)) for row in results] diff --git a/ee/clickhouse/queries/clickhouse_stickiness.py 
b/ee/clickhouse/queries/clickhouse_stickiness.py index 696dc4b95a93b..2becf4143382f 100644 --- a/ee/clickhouse/queries/clickhouse_stickiness.py +++ b/ee/clickhouse/queries/clickhouse_stickiness.py @@ -1,31 +1,47 @@ -from typing import Any, Dict, Optional +from datetime import datetime +from typing import Any, Dict, Tuple + +from django.conf import settings +from django.db.models.expressions import F +from django.utils import timezone +from rest_framework.request import Request +from rest_framework.utils.serializer_helpers import ReturnDict +from sentry_sdk.api import capture_exception from ee.clickhouse.client import sync_execute from ee.clickhouse.models.action import format_action_filter +from ee.clickhouse.models.person import ClickhousePersonSerializer from ee.clickhouse.models.property import parse_prop_clauses -from ee.clickhouse.queries.util import parse_timestamps +from ee.clickhouse.queries.util import get_trunc_func_ch, parse_timestamps +from ee.clickhouse.sql.person import ( + GET_LATEST_PERSON_SQL, + GET_TEAM_PERSON_DISTINCT_IDS, + INSERT_COHORT_ALL_PEOPLE_SQL, + PEOPLE_SQL, + PERSON_STATIC_COHORT_TABLE, +) from ee.clickhouse.sql.stickiness.stickiness import STICKINESS_SQL from ee.clickhouse.sql.stickiness.stickiness_actions import STICKINESS_ACTIONS_SQL +from ee.clickhouse.sql.stickiness.stickiness_people import STICKINESS_PEOPLE_SQL from posthog.constants import TREND_FILTER_TYPE_ACTIONS -from posthog.models.action import Action +from posthog.models.cohort import Cohort from posthog.models.entity import Entity -from posthog.models.filter import Filter +from posthog.models.filters.stickiness_filter import StickinessFilter +from posthog.models.team import Team from posthog.queries.stickiness import Stickiness class ClickhouseStickiness(Stickiness): - def stickiness(self, entity: Entity, filter: Filter, team_id: int) -> Dict[str, Any]: - if not filter.date_to or not filter.date_from: - raise ValueError("_stickiness needs date_to and date_from set") - range_days = (filter.date_to - filter.date_from).days + 2 + def stickiness(self, entity: Entity, filter: StickinessFilter, team_id: int) -> Dict[str, Any]: - parsed_date_from, parsed_date_to = parse_timestamps(filter=filter) - prop_filters, prop_filter_params = parse_prop_clauses(filter.properties, team_id) + parsed_date_from, parsed_date_to, date_params = parse_timestamps(filter=filter, team_id=team_id) + prop_filters, prop_filter_params = parse_prop_clauses(filter.properties + entity.properties, team_id) + trunc_func = get_trunc_func_ch(filter.interval) params: Dict = {"team_id": team_id} - params = {**params, **prop_filter_params} + params = {**params, **prop_filter_params, "num_intervals": filter.total_intervals, **date_params} if entity.type == TREND_FILTER_TYPE_ACTIONS: - action = Action.objects.get(pk=entity.id) + action = entity.get_action() action_query, action_params = format_action_filter(action) if action_query == "": return {} @@ -37,6 +53,8 @@ def stickiness(self, entity: Entity, filter: Filter, team_id: int) -> Dict[str, parsed_date_from=parsed_date_from, parsed_date_to=parsed_date_to, filters=prop_filters, + trunc_func=trunc_func, + GET_TEAM_PERSON_DISTINCT_IDS=GET_TEAM_PERSON_DISTINCT_IDS, ) else: content_sql = STICKINESS_SQL.format( @@ -45,7 +63,93 @@ def stickiness(self, entity: Entity, filter: Filter, team_id: int) -> Dict[str, parsed_date_from=parsed_date_from, parsed_date_to=parsed_date_to, filters=prop_filters, + trunc_func=trunc_func, + GET_TEAM_PERSON_DISTINCT_IDS=GET_TEAM_PERSON_DISTINCT_IDS, ) counts = 
sync_execute(content_sql, params) - return self.process_result(counts, range_days) + return self.process_result(counts, filter) + + def _retrieve_people( + self, target_entity: Entity, filter: StickinessFilter, team: Team, request: Request + ) -> ReturnDict: + return retrieve_stickiness_people(target_entity, filter, team) + + +def _format_entity_filter(entity: Entity) -> Tuple[str, Dict]: + if entity.type == TREND_FILTER_TYPE_ACTIONS: + action = entity.get_action() + action_query, params = format_action_filter(action) + entity_filter = "AND {}".format(action_query) + else: + entity_filter = "AND event = %(event)s" + params = {"event": entity.id} + + return entity_filter, params + + +def _process_content_sql(target_entity: Entity, filter: StickinessFilter, team: Team) -> Tuple[str, Dict[str, Any]]: + parsed_date_from, parsed_date_to, date_params = parse_timestamps(filter=filter, team_id=team.pk) + prop_filters, prop_filter_params = parse_prop_clauses(filter.properties + target_entity.properties, team.pk) + entity_sql, entity_params = _format_entity_filter(entity=target_entity) + trunc_func = get_trunc_func_ch(filter.interval) + + params: Dict = { + "team_id": team.pk, + **prop_filter_params, + "stickiness_day": filter.selected_interval, + **entity_params, + "offset": filter.offset, + **date_params, + } + + content_sql = STICKINESS_PEOPLE_SQL.format( + entity_filter=entity_sql, + parsed_date_from=parsed_date_from, + parsed_date_to=parsed_date_to, + filters=prop_filters, + trunc_func=trunc_func, + GET_TEAM_PERSON_DISTINCT_IDS=GET_TEAM_PERSON_DISTINCT_IDS, + ) + return content_sql, params + + +def retrieve_stickiness_people(target_entity: Entity, filter: StickinessFilter, team: Team) -> ReturnDict: + + content_sql, params = _process_content_sql(target_entity, filter, team) + + people = sync_execute( + PEOPLE_SQL.format( + content_sql=content_sql, + query="", + latest_person_sql=GET_LATEST_PERSON_SQL.format(query=""), + GET_TEAM_PERSON_DISTINCT_IDS=GET_TEAM_PERSON_DISTINCT_IDS, + ), + params, + ) + return ClickhousePersonSerializer(people, many=True).data + + +def insert_stickiness_people_into_cohort(cohort: Cohort, target_entity: Entity, filter: StickinessFilter) -> None: + content_sql, params = _process_content_sql(target_entity, filter, cohort.team) + try: + sync_execute( + INSERT_COHORT_ALL_PEOPLE_SQL.format( + content_sql=content_sql, + latest_person_sql=GET_LATEST_PERSON_SQL.format(query=""), + cohort_table=PERSON_STATIC_COHORT_TABLE, + GET_TEAM_PERSON_DISTINCT_IDS=GET_TEAM_PERSON_DISTINCT_IDS, + ), + {"cohort_id": cohort.pk, "_timestamp": datetime.now(), **params}, + ) + cohort.is_calculating = False + cohort.last_calculation = timezone.now() + cohort.errors_calculating = 0 + cohort.save() + except Exception as err: + if settings.DEBUG: + raise err + cohort.is_calculating = False + cohort.errors_calculating = F("errors_calculating") + 1 + cohort.save() + capture_exception(err) diff --git a/ee/clickhouse/queries/column_optimizer.py b/ee/clickhouse/queries/column_optimizer.py new file mode 100644 index 0000000000000..d469a7b50ea4d --- /dev/null +++ b/ee/clickhouse/queries/column_optimizer.py @@ -0,0 +1,128 @@ +from typing import Counter, List, Set, Union, cast + +from ee.clickhouse.materialized_columns.columns import ColumnName, get_materialized_columns +from ee.clickhouse.models.action import get_action_tables_and_properties, uses_elements_chain +from ee.clickhouse.models.property import extract_tables_and_properties +from posthog.constants import TREND_FILTER_TYPE_ACTIONS, 
FunnelCorrelationType +from posthog.models.entity import Entity +from posthog.models.filters import Filter +from posthog.models.filters.mixins.utils import cached_property +from posthog.models.filters.path_filter import PathFilter +from posthog.models.filters.retention_filter import RetentionFilter +from posthog.models.property import GroupTypeIndex, PropertyIdentifier, PropertyType, TableWithProperties + + +class ColumnOptimizer: + """ + This class is responsible for figuring out what columns can and should be materialized based on the query filter. + + This speeds up queries since clickhouse ends up selecting less data. + """ + + def __init__(self, filter: Union[Filter, PathFilter, RetentionFilter], team_id: int): + self.filter = filter + self.team_id = team_id + + @cached_property + def event_columns_to_query(self) -> Set[ColumnName]: + "Returns a list of event table columns containing materialized properties that this query needs" + + return self.columns_to_query("events", set(self._used_properties_with_type("event"))) + + @cached_property + def person_columns_to_query(self) -> Set[ColumnName]: + "Returns a list of person table columns containing materialized properties that this query needs" + + return self.columns_to_query("person", set(self._used_properties_with_type("person"))) + + def columns_to_query(self, table: TableWithProperties, used_properties: Set[PropertyIdentifier]) -> Set[ColumnName]: + "Transforms a list of property names to what columns are needed for that query" + + materialized_columns = get_materialized_columns(table) + return set(materialized_columns.get(property_name, "properties") for property_name, _, _ in used_properties) + + @cached_property + def is_using_person_properties(self) -> bool: + return len(self._used_properties_with_type("person")) > 0 + + @cached_property + def group_types_to_query(self) -> Set[GroupTypeIndex]: + used_properties = self._used_properties_with_type("group") + return set(cast(int, group_type_index) for _, _, group_type_index in used_properties) + + @cached_property + def should_query_elements_chain_column(self) -> bool: + "Returns whether this query uses elements_chain" + has_element_type_property = lambda properties: any(prop.type == "element" for prop in properties) + + if has_element_type_property(self.filter.properties): + return True + + # Both entities and funnel exclusions can contain nested elements_chain inclusions + for entity in self.filter.entities + cast(List[Entity], self.filter.exclusions): + if has_element_type_property(entity.properties): + return True + + # :TRICKY: Action definition may contain elements_chain usage + # + # See ee/clickhouse/models/action.py#format_action_filter for an example + if entity.type == TREND_FILTER_TYPE_ACTIONS: + if uses_elements_chain(entity.get_action()): + return True + + return False + + @cached_property + def properties_used_in_filter(self) -> Counter[PropertyIdentifier]: + "Returns collection of properties + types that this query would use" + counter: Counter[PropertyIdentifier] = extract_tables_and_properties(self.filter.properties) + + # Some breakdown types read properties + # + # See ee/clickhouse/queries/trends/breakdown.py#get_query or + # ee/clickhouse/queries/breakdown_props.py#get_breakdown_prop_values + if self.filter.breakdown_type in ["event", "person"]: + # :TRICKY: We only support string breakdown for event/person properties + assert isinstance(self.filter.breakdown, str) + counter[(self.filter.breakdown, self.filter.breakdown_type, None)] += 1 + elif 
self.filter.breakdown_type == "group": + # :TRICKY: We only support string breakdown for group properties + assert isinstance(self.filter.breakdown, str) + counter[(self.filter.breakdown, self.filter.breakdown_type, self.filter.breakdown_group_type_index)] += 1 + + # Both entities and funnel exclusions can contain nested property filters + for entity in self.filter.entities + cast(List[Entity], self.filter.exclusions): + counter += extract_tables_and_properties(entity.properties) + + # Math properties are also implicitly used. + # + # See ee/clickhouse/queries/trends/util.py#process_math + if entity.math_property: + counter[(entity.math_property, "event", None)] += 1 + + # If groups are involved, they're also used + # + # See ee/clickhouse/queries/trends/util.py#process_math + if entity.math == "unique_group": + counter[(f"$group_{entity.math_group_type_index}", "event", None)] += 1 + + # :TRICKY: If action contains property filters, these need to be included + # + # See ee/clickhouse/models/action.py#format_action_filter for an example + if entity.type == TREND_FILTER_TYPE_ACTIONS: + counter += get_action_tables_and_properties(entity.get_action()) + + if self.filter.correlation_type == FunnelCorrelationType.PROPERTIES and self.filter.correlation_property_names: + for prop_value in self.filter.correlation_property_names: + counter[(prop_value, "person", None)] += 1 + + return counter + + def _used_properties_with_type(self, property_type: PropertyType) -> Counter[PropertyIdentifier]: + return Counter( + { + (name, type, group_type_index): count + for (name, type, group_type_index), count in self.properties_used_in_filter.items() + if type == property_type + } + ) diff --git a/ee/clickhouse/queries/event_query.py b/ee/clickhouse/queries/event_query.py new file mode 100644 index 0000000000000..a54c7a9d52246 --- /dev/null +++ b/ee/clickhouse/queries/event_query.py @@ -0,0 +1,191 @@ +from abc import ABCMeta, abstractmethod +from typing import Any, Dict, List, Tuple, Union + +from ee.clickhouse.materialized_columns.columns import ColumnName +from ee.clickhouse.models.cohort import format_person_query, format_precalculated_cohort_query, is_precalculated_query +from ee.clickhouse.models.property import get_property_string_expr, parse_prop_clauses +from ee.clickhouse.models.util import PersonPropertiesMode +from ee.clickhouse.queries.column_optimizer import ColumnOptimizer +from ee.clickhouse.queries.groups_join_query import GroupsJoinQuery +from ee.clickhouse.queries.person_query import ClickhousePersonQuery +from ee.clickhouse.queries.util import parse_timestamps +from ee.clickhouse.sql.person import GET_TEAM_PERSON_DISTINCT_IDS +from posthog.models import Cohort, Filter, Property +from posthog.models.filters.path_filter import PathFilter +from posthog.models.filters.retention_filter import RetentionFilter + + +class ClickhouseEventQuery(metaclass=ABCMeta): + DISTINCT_ID_TABLE_ALIAS = "pdi" + PERSON_TABLE_ALIAS = "person" + EVENT_TABLE_ALIAS = "e" + + _filter: Union[Filter, PathFilter, RetentionFilter] + _team_id: int + _column_optimizer: ColumnOptimizer + _person_query: ClickhousePersonQuery + _should_join_distinct_ids = False + _should_join_persons = False + _should_round_interval = False + _extra_fields: List[ColumnName] + _extra_person_fields: List[ColumnName] + + def __init__( + self, + filter: Union[Filter, PathFilter, RetentionFilter], + team_id: int, + round_interval=False, + should_join_distinct_ids=False, + should_join_persons=False, + # Extra events/person table columns to fetch since 
parent query needs them + extra_fields: List[ColumnName] = [], + extra_person_fields: List[ColumnName] = [], + **kwargs, + ) -> None: + self._filter = filter + self._team_id = team_id + self._column_optimizer = ColumnOptimizer(self._filter, self._team_id) + self._person_query = ClickhousePersonQuery( + self._filter, self._team_id, self._column_optimizer, extra_fields=extra_person_fields + ) + self.params: Dict[str, Any] = { + "team_id": self._team_id, + } + + self._should_join_distinct_ids = should_join_distinct_ids + self._should_join_persons = should_join_persons + self._extra_fields = extra_fields + self._extra_person_fields = extra_person_fields + + if not self._should_join_distinct_ids: + self._determine_should_join_distinct_ids() + + if not self._should_join_persons: + self._determine_should_join_persons() + + self._should_round_interval = round_interval + + @abstractmethod + def get_query(self) -> Tuple[str, Dict[str, Any]]: + pass + + @abstractmethod + def _determine_should_join_distinct_ids(self) -> None: + pass + + def _get_disintct_id_query(self) -> str: + if self._should_join_distinct_ids: + return f""" + INNER JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) AS {self.DISTINCT_ID_TABLE_ALIAS} + ON events.distinct_id = {self.DISTINCT_ID_TABLE_ALIAS}.distinct_id + """ + else: + return "" + + def _determine_should_join_persons(self) -> None: + if self._person_query.is_used: + self._should_join_distinct_ids = True + self._should_join_persons = True + return + + # :KLUDGE: The following is mostly making sure if cohorts are included as well. + # Can be simplified significantly after https://github.com/PostHog/posthog/issues/5854 + if any(self._should_property_join_persons(prop) for prop in self._filter.properties): + self._should_join_distinct_ids = True + self._should_join_persons = True + return + + if any( + self._should_property_join_persons(prop) for entity in self._filter.entities for prop in entity.properties + ): + self._should_join_distinct_ids = True + self._should_join_persons = True + return + + if self._filter.breakdown_type == "person": + self._should_join_distinct_ids = True + self._should_join_persons = True + return + + def _should_property_join_persons(self, prop: Property) -> bool: + return prop.type == "cohort" and self._does_cohort_need_persons(prop) + + def _does_cohort_need_persons(self, prop: Property) -> bool: + try: + cohort: Cohort = Cohort.objects.get(pk=prop.value, team_id=self._team_id) + except Cohort.DoesNotExist: + return False + if is_precalculated_query(cohort): + return True + if cohort.is_static: + return True + for group in cohort.groups: + if group.get("properties"): + return True + return False + + def _get_person_query(self) -> Tuple[str, Dict]: + if self._should_join_persons: + person_query, params = self._person_query.get_query() + return ( + f""" + INNER JOIN ({person_query}) {self.PERSON_TABLE_ALIAS} + ON {self.PERSON_TABLE_ALIAS}.id = {self.DISTINCT_ID_TABLE_ALIAS}.person_id + """, + params, + ) + else: + return "", {} + + def _get_groups_query(self) -> Tuple[str, Dict]: + return GroupsJoinQuery(self._filter, self._team_id, self._column_optimizer).get_join_query() + + def _get_date_filter(self) -> Tuple[str, Dict]: + + parsed_date_from, parsed_date_to, date_params = parse_timestamps(filter=self._filter, team_id=self._team_id) + + query = f""" + {parsed_date_from} + {parsed_date_to} + """ + + return query, date_params + + def _get_props(self, filters: List[Property]) -> Tuple[str, Dict]: + final = [] + params: Dict[str, Any] = {} + + for idx, prop in 
enumerate(filters): + if prop.type == "cohort": + person_id_query, cohort_filter_params = self._get_cohort_subquery(prop) + params = {**params, **cohort_filter_params} + final.append(f"AND {person_id_query}") + else: + filter_query, filter_params = parse_prop_clauses( + [prop], + self._team_id, + prepend=f"global_{idx}", + allow_denormalized_props=True, + person_properties_mode=PersonPropertiesMode.EXCLUDE, + ) + final.append(filter_query) + params.update(filter_params) + return " ".join(final), params + + def _get_cohort_subquery(self, prop) -> Tuple[str, Dict[str, Any]]: + try: + cohort: Cohort = Cohort.objects.get(pk=prop.value, team_id=self._team_id) + except Cohort.DoesNotExist: + return "0 = 11", {} # If cohort doesn't exist, nothing can match + + is_precalculated = is_precalculated_query(cohort) + + person_id_query, cohort_filter_params = ( + format_precalculated_cohort_query( + cohort.pk, 0, custom_match_field=f"{self.DISTINCT_ID_TABLE_ALIAS}.person_id" + ) + if is_precalculated + else format_person_query(cohort, 0, custom_match_field=f"{self.DISTINCT_ID_TABLE_ALIAS}.person_id") + ) + + return person_id_query, cohort_filter_params diff --git a/ee/clickhouse/queries/funnels/__init__.py b/ee/clickhouse/queries/funnels/__init__.py new file mode 100644 index 0000000000000..aaa34917d20f7 --- /dev/null +++ b/ee/clickhouse/queries/funnels/__init__.py @@ -0,0 +1,8 @@ +from .base import ClickhouseFunnelBase +from .funnel import ClickhouseFunnel +from .funnel_persons import ClickhouseFunnelPersons +from .funnel_strict import ClickhouseFunnelStrict +from .funnel_time_to_convert import ClickhouseFunnelTimeToConvert +from .funnel_trends import ClickhouseFunnelTrends +from .funnel_trends_persons import ClickhouseFunnelTrendsPersons +from .funnel_unordered import ClickhouseFunnelUnordered diff --git a/ee/clickhouse/queries/funnels/base.py b/ee/clickhouse/queries/funnels/base.py new file mode 100644 index 0000000000000..4672e4be878bb --- /dev/null +++ b/ee/clickhouse/queries/funnels/base.py @@ -0,0 +1,515 @@ +from abc import ABC +from typing import Any, Dict, List, Optional, Tuple, Union, cast + +from django.utils import timezone +from rest_framework.exceptions import ValidationError + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.models.action import format_action_filter +from ee.clickhouse.models.property import get_property_string_expr, parse_prop_clauses +from ee.clickhouse.models.util import PersonPropertiesMode +from ee.clickhouse.queries.breakdown_props import format_breakdown_cohort_join_query, get_breakdown_prop_values +from ee.clickhouse.queries.funnels.funnel_event_query import FunnelEventQuery +from ee.clickhouse.sql.funnels.funnel import FUNNEL_INNER_EVENT_STEPS_QUERY +from posthog.constants import FUNNEL_WINDOW_INTERVAL, FUNNEL_WINDOW_INTERVAL_UNIT, LIMIT, TREND_FILTER_TYPE_ACTIONS +from posthog.models import Entity, Filter, Team +from posthog.queries.funnel import Funnel +from posthog.utils import relative_date_parse + + +class ClickhouseFunnelBase(ABC, Funnel): + _filter: Filter + _team: Team + _include_timestamp: Optional[bool] + _include_preceding_timestamp: Optional[bool] + _no_person_limit: Optional[bool] # used when paths are querying for filter people + + def __init__( + self, + filter: Filter, + team: Team, + include_timestamp: Optional[bool] = None, + include_preceding_timestamp: Optional[bool] = None, + no_person_limit: Optional[bool] = False, + ) -> None: + self._filter = filter + self._team = team + self.params = { + "team_id": self._team.pk, + 
"events": [], # purely a speed optimization, don't need this for filtering + } + self._include_timestamp = include_timestamp + self._include_preceding_timestamp = include_preceding_timestamp + + # handle default if window isn't provided + if not self._filter.funnel_window_days and not self._filter.funnel_window_interval: + self._filter = self._filter.with_data({FUNNEL_WINDOW_INTERVAL: 14, FUNNEL_WINDOW_INTERVAL_UNIT: "day"}) + + if self._filter.funnel_window_days: + self._filter = self._filter.with_data( + {FUNNEL_WINDOW_INTERVAL: self._filter.funnel_window_days, FUNNEL_WINDOW_INTERVAL_UNIT: "day"} + ) + + if not self._filter.limit: + new_limit = {LIMIT: 100} + self._filter = self._filter.with_data(new_limit) + self.params.update(new_limit) + + self._update_filters() + + self._no_person_limit = no_person_limit + + def run(self, *args, **kwargs): + if len(self._filter.entities) == 0: + return [] + + results = self._exec_query() + return self._format_results(results) + + def _update_filters(self): + # format default dates + data: Dict[str, Any] = {} + if not self._filter._date_from: + data.update({"date_from": relative_date_parse("-7d")}) + if not self._filter._date_to: + data.update({"date_to": timezone.now()}) + + if self._filter.breakdown and not self._filter.breakdown_type: + data.update({"breakdown_type": "event"}) + + for exclusion in self._filter.exclusions: + if exclusion.funnel_from_step is None or exclusion.funnel_to_step is None: + raise ValidationError("Exclusion event needs to define funnel steps") + + if exclusion.funnel_from_step >= exclusion.funnel_to_step: + raise ValidationError("Exclusion event range is invalid. End of range should be greater than start.") + + if exclusion.funnel_from_step >= len(self._filter.entities) - 1: + raise ValidationError( + "Exclusion event range is invalid. Start of range is greater than number of steps." + ) + + if exclusion.funnel_to_step > len(self._filter.entities) - 1: + raise ValidationError("Exclusion event range is invalid. 
End of range is greater than number of steps.") + + for entity in self._filter.entities[exclusion.funnel_from_step : exclusion.funnel_to_step + 1]: + if entity.equals(exclusion) or exclusion.is_superset(entity): + raise ValidationError("Exclusion event can't be the same as funnel step") + + self._filter = self._filter.with_data(data) + + def _format_single_funnel(self, results, with_breakdown=False): + # Format of this is [step order, person count (that reached that step), array of person uuids] + steps = [] + total_people = 0 + + for step in reversed(self._filter.entities): + + if results and len(results) > 0: + total_people += results[step.order] + + serialized_result = self._serialize_step(step, total_people, []) + if cast(int, step.order) > 0: + serialized_result.update( + { + "average_conversion_time": results[cast(int, step.order) + len(self._filter.entities) - 1], + "median_conversion_time": results[cast(int, step.order) + len(self._filter.entities) * 2 - 2], + } + ) + else: + serialized_result.update({"average_conversion_time": None, "median_conversion_time": None}) + + if with_breakdown: + serialized_result.update({"breakdown": results[-1], "breakdown_value": results[-1]}) + # important to not try and modify this value any how - as these are keys for fetching persons + + steps.append(serialized_result) + + return steps[::-1] #  reverse + + def _format_results(self, results): + if not results or len(results) == 0: + return [] + + if self._filter.breakdown: + return [self._format_single_funnel(res, with_breakdown=True) for res in results] + else: + return self._format_single_funnel(results[0]) + + def _exec_query(self) -> List[Tuple]: + query = self.get_query() + return sync_execute(query, self.params) + + def _get_timestamp_outer_select(self) -> str: + if self._include_preceding_timestamp: + return ", max_timestamp, min_timestamp" + elif self._include_timestamp: + return ", timestamp" + else: + return "" + + def _get_timestamp_selects(self) -> Tuple[str, str]: + """ + Returns timestamp selectors for the target step and optionally the preceding step. + In the former case, always returns the timestamp for the first and last step as well. 
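+        A sketch of the index mapping implemented below:
+            funnel_step=2  -> target index 1 (latest_1)
+            funnel_step=-3 -> target index abs(-3) - 2 = 1, i.e. reached step 2 but dropped
+                              off before step 3 (-2 is the first valid drop-off value)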
+ """ + target_step = self._filter.funnel_step + final_step = len(self._filter.entities) - 1 + first_step = 0 + + if not target_step: + return "", "" + + if target_step < 0: + # the first valid dropoff argument for funnel_step is -2 + # -2 refers to persons who performed the first step but never made it to the second + if target_step == -1: + raise ValueError("To request dropoff of initial step use -2") + + target_step = abs(target_step) - 2 + else: + target_step -= 1 + + if self._include_preceding_timestamp: + + if target_step == 0: + raise ValueError("Cannot request preceding step timestamp if target funnel step is the first step") + + return ( + f", latest_{target_step}, latest_{target_step - 1}", + f", argMax(latest_{target_step}, steps) as max_timestamp, argMax(latest_{target_step - 1}, steps) as min_timestamp", + ) + elif self._include_timestamp: + return ( + f", latest_{target_step}, latest_{final_step}, latest_{first_step}", + f", argMax(latest_{target_step}, steps) as timestamp, argMax(latest_{final_step}, steps) as final_timestamp, argMax(latest_{first_step}, steps) as first_timestamp", + ) + else: + return "", "" + + def _get_step_times(self, max_steps: int): + conditions: List[str] = [] + for i in range(1, max_steps): + conditions.append( + f"if(isNotNull(latest_{i}) AND latest_{i} <= latest_{i-1} + INTERVAL {self._filter.funnel_window_interval} {self._filter.funnel_window_interval_unit_ch()}, " + f"dateDiff('second', toDateTime(latest_{i - 1}), toDateTime(latest_{i})), NULL) step_{i}_conversion_time" + ) + + formatted = ", ".join(conditions) + return f", {formatted}" if formatted else "" + + def _get_partition_cols(self, level_index: int, max_steps: int): + cols: List[str] = [] + for i in range(0, max_steps): + cols.append(f"step_{i}") + if i < level_index: + cols.append(f"latest_{i}") + for exclusion_id, exclusion in enumerate(self._filter.exclusions): + if cast(int, exclusion.funnel_from_step) + 1 == i: + cols.append(f"exclusion_{exclusion_id}_latest_{exclusion.funnel_from_step}") + else: + duplicate_event = 0 + if i > 0 and ( + self._filter.entities[i].equals(self._filter.entities[i - 1]) + or self._filter.entities[i].is_superset(self._filter.entities[i - 1]) + ): + duplicate_event = 1 + cols.append( + f"min(latest_{i}) over (PARTITION by aggregation_target {self._get_breakdown_prop()} ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND {duplicate_event} PRECEDING) latest_{i}" + ) + for exclusion_id, exclusion in enumerate(self._filter.exclusions): + # exclusion starting at step i follows semantics of step i+1 in the query (since we're looking for exclusions after step i) + if cast(int, exclusion.funnel_from_step) + 1 == i: + cols.append( + f"min(exclusion_{exclusion_id}_latest_{exclusion.funnel_from_step}) over (PARTITION by aggregation_target {self._get_breakdown_prop()} ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) exclusion_{exclusion_id}_latest_{exclusion.funnel_from_step}" + ) + return ", ".join(cols) + + def _get_exclusion_condition(self): + if not self._filter.exclusions: + return "" + + conditions = [] + for exclusion_id, exclusion in enumerate(self._filter.exclusions): + from_time = f"latest_{exclusion.funnel_from_step}" + to_time = f"latest_{exclusion.funnel_to_step}" + exclusion_time = f"exclusion_{exclusion_id}_latest_{exclusion.funnel_from_step}" + condition = ( + f"if( {exclusion_time} > {from_time} AND {exclusion_time} < " + f"if(isNull({to_time}), {from_time} + INTERVAL {self._filter.funnel_window_interval} 
{self._filter.funnel_window_interval_unit_ch()}, {to_time}), 1, 0)" + ) + conditions.append(condition) + + if conditions: + return f", arraySum([{','.join(conditions)}]) as exclusion" + else: + return "" + + def _get_sorting_condition(self, curr_index: int, max_steps: int): + + if curr_index == 1: + return "1" + + conditions: List[str] = [] + for i in range(1, curr_index): + conditions.append(f"latest_{i - 1} < latest_{i }") + conditions.append( + f"latest_{i} <= latest_0 + INTERVAL {self._filter.funnel_window_interval} {self._filter.funnel_window_interval_unit_ch()}" + ) + + return f"if({' AND '.join(conditions)}, {curr_index}, {self._get_sorting_condition(curr_index - 1, max_steps)})" + + def _get_inner_event_query( + self, entities=None, entity_name="events", skip_entity_filter=False, skip_step_filter=False + ) -> str: + entities_to_use = entities or self._filter.entities + + event_query, params = FunnelEventQuery(filter=self._filter, team_id=self._team.pk).get_query( + entities_to_use, entity_name, skip_entity_filter=skip_entity_filter + ) + + self.params.update(params) + + if skip_step_filter: + steps_conditions = "1=1" + else: + steps_conditions = self._get_steps_conditions(length=len(entities_to_use)) + + all_step_cols: List[str] = [] + for index, entity in enumerate(entities_to_use): + step_cols = self._get_step_col(entity, index, entity_name) + all_step_cols.extend(step_cols) + + for exclusion_id, entity in enumerate(self._filter.exclusions): + step_cols = self._get_step_col(entity, entity.funnel_from_step, entity_name, f"exclusion_{exclusion_id}_") + # every exclusion entity has the form: exclusion__step_i & timestamp exclusion__latest_i + # where i is the starting step for exclusion on that entity + all_step_cols.extend(step_cols) + + steps = ", ".join(all_step_cols) + + select_prop = self._get_breakdown_select_prop() + breakdown_conditions = "" + extra_conditions = "" + extra_join = "" + + if self._filter.breakdown: + if self._filter.breakdown_type == "cohort": + extra_join = self._get_cohort_breakdown_join() + else: + breakdown_conditions = self._get_breakdown_conditions() + extra_conditions = f" AND {breakdown_conditions}" if breakdown_conditions and select_prop else "" + + return FUNNEL_INNER_EVENT_STEPS_QUERY.format( + steps=steps, + event_query=event_query, + extra_join=extra_join, + steps_condition=steps_conditions, + select_prop=select_prop, + extra_conditions=extra_conditions, + ) + + def _get_steps_conditions(self, length: int) -> str: + step_conditions: List[str] = [] + + for index in range(length): + step_conditions.append(f"step_{index} = 1") + + for exclusion_id, entity in enumerate(self._filter.exclusions): + step_conditions.append(f"exclusion_{exclusion_id}_step_{entity.funnel_from_step} = 1") + + return " OR ".join(step_conditions) + + def _get_step_col(self, entity: Entity, index: int, entity_name: str, step_prefix: str = "") -> List[str]: + # step prefix is used to distinguish actual steps, and exclusion steps + # without the prefix, we get the same parameter binding for both, which borks things up + step_cols: List[str] = [] + condition = self._build_step_query(entity, index, entity_name, step_prefix) + step_cols.append(f"if({condition}, 1, 0) as {step_prefix}step_{index}") + step_cols.append(f"if({step_prefix}step_{index} = 1, timestamp, null) as {step_prefix}latest_{index}") + + return step_cols + + def _build_step_query(self, entity: Entity, index: int, entity_name: str, step_prefix: str) -> str: + filters = self._build_filters(entity, index) + if 
entity.type == TREND_FILTER_TYPE_ACTIONS: + action = entity.get_action() + for action_step in action.steps.all(): + if entity_name not in self.params[entity_name]: + self.params[entity_name].append(action_step.event) + action_query, action_params = format_action_filter(action, f"{entity_name}_{step_prefix}step_{index}") + if action_query == "": + return "" + + self.params.update(action_params) + content_sql = "{actions_query} {filters}".format(actions_query=action_query, filters=filters,) + else: + if entity.id not in self.params[entity_name]: + self.params[entity_name].append(entity.id) + event_param_key = f"{entity_name}_{step_prefix}event_{index}" + self.params[event_param_key] = entity.id + content_sql = f"event = %({event_param_key})s {filters}" + return content_sql + + def _build_filters(self, entity: Entity, index: int) -> str: + prop_filters, prop_filter_params = parse_prop_clauses( + entity.properties, + self._team.pk, + prepend=str(index), + person_properties_mode=PersonPropertiesMode.USING_PERSON_PROPERTIES_COLUMN, + ) + self.params.update(prop_filter_params) + if entity.properties: + return prop_filters + return "" + + def _get_funnel_person_step_condition(self): + step_num = self._filter.funnel_step + custom_steps = self._filter.funnel_custom_steps + max_steps = len(self._filter.entities) + + conditions = [] + + if custom_steps: + self.params.update({"custom_step_num": custom_steps}) + conditions.append("steps IN %(custom_step_num)s") + elif step_num is not None: + if step_num >= 0: + self.params.update({"step_num": [i for i in range(step_num, max_steps + 1)]}) + conditions.append("steps IN %(step_num)s") + else: + self.params.update({"step_num": abs(step_num) - 1}) + conditions.append("steps = %(step_num)s") + else: + raise ValueError("Missing both funnel_step and funnel_custom_steps") + + if self._filter.funnel_step_breakdown is not None: + prop_vals = self._parse_breakdown_prop_value() + self.params.update({"breakdown_prop_value": prop_vals}) + conditions.append("prop IN %(breakdown_prop_value)s") + + return " AND ".join(conditions) + + def _parse_breakdown_prop_value(self): + prop_vals: List[Union[str, int]] = ( + [val.strip() for val in self._filter.funnel_step_breakdown.split(",")] + if isinstance(self._filter.funnel_step_breakdown, str) + else [cast(int, self._filter.funnel_step_breakdown)] + ) + return prop_vals + + def _get_count_columns(self, max_steps: int): + cols: List[str] = [] + + for i in range(max_steps): + cols.append(f"countIf(steps = {i + 1}) step_{i + 1}") + + return ", ".join(cols) + + def _get_step_time_names(self, max_steps: int): + names = [] + for i in range(1, max_steps): + names.append(f"step_{i}_conversion_time") + + formatted = ",".join(names) + return f", {formatted}" if formatted else "" + + def _get_step_time_avgs(self, max_steps: int, inner_query: bool = False): + conditions: List[str] = [] + for i in range(1, max_steps): + conditions.append( + f"avg(step_{i}_conversion_time) step_{i}_average_conversion_time_inner" + if inner_query + else f"avg(step_{i}_average_conversion_time_inner) step_{i}_average_conversion_time" + ) + + formatted = ", ".join(conditions) + return f", {formatted}" if formatted else "" + + def _get_step_time_median(self, max_steps: int, inner_query: bool = False): + conditions: List[str] = [] + for i in range(1, max_steps): + conditions.append( + f"median(step_{i}_conversion_time) step_{i}_median_conversion_time_inner" + if inner_query + else f"median(step_{i}_median_conversion_time_inner) step_{i}_median_conversion_time" + 
) + + formatted = ", ".join(conditions) + return f", {formatted}" if formatted else "" + + def get_query(self) -> str: + raise NotImplementedError() + + def get_step_counts_query(self) -> str: + raise NotImplementedError() + + def get_step_counts_without_aggregation_query(self) -> str: + raise NotImplementedError() + + def _get_breakdown_select_prop(self) -> str: + if self._filter.breakdown: + self.params.update({"breakdown": self._filter.breakdown}) + if self._filter.breakdown_type == "person": + # :TRICKY: We only support string breakdown for event/person properties + assert isinstance(self._filter.breakdown, str) + expression, _ = get_property_string_expr( + "person", self._filter.breakdown, "%(breakdown)s", "person_props" + ) + return f", {expression} AS prop" + elif self._filter.breakdown_type == "event": + # :TRICKY: We only support string breakdown for event/person properties + assert isinstance(self._filter.breakdown, str) + expression, _ = get_property_string_expr( + "events", self._filter.breakdown, "%(breakdown)s", "properties" + ) + return f", {expression} AS prop" + elif self._filter.breakdown_type == "cohort": + return ", value AS prop" + elif self._filter.breakdown_type == "group": + # :TRICKY: We only support string breakdown for group properties + assert isinstance(self._filter.breakdown, str) + properties_field = f"group_properties_{self._filter.breakdown_group_type_index}" + expression, _ = get_property_string_expr( + "groups", self._filter.breakdown, "%(breakdown)s", properties_field + ) + return f", {expression} AS prop" + + return "" + + def _get_cohort_breakdown_join(self) -> str: + cohort_queries, ids, cohort_params = format_breakdown_cohort_join_query(self._team.pk, self._filter) + self.params.update({"breakdown_values": ids}) + self.params.update(cohort_params) + return f""" + INNER JOIN ( + {cohort_queries} + ) cohort_join + ON events.distinct_id = cohort_join.distinct_id + """ + + def _get_breakdown_conditions(self) -> str: + if self._filter.breakdown: + limit = self._filter.breakdown_limit_or_default + first_entity = self._filter.entities[0] + + values = get_breakdown_prop_values( + self._filter, first_entity, "count(*)", self._team.pk, limit, extra_params={"offset": 0} + ) + # For people, pagination sets the offset param, which is common across filters + # and gives us the wrong breakdown values here, so we override it. + # For events, we assume breakdown values remain stable across the funnel, + # so using just the first entity to get breakdown values is ok. + + self.params.update({"breakdown_values": values}) + + return "" + + def _get_breakdown_prop(self, group_remaining=False) -> str: + if self._filter.breakdown: + if group_remaining and self._filter.breakdown_type != "cohort": + return ", if(has(%(breakdown_values)s, prop), prop, 'Other') as prop" + else: + return ", prop" + else: + return "" diff --git a/ee/clickhouse/queries/funnels/funnel.py b/ee/clickhouse/queries/funnels/funnel.py new file mode 100644 index 0000000000000..c8960e3ef31ea --- /dev/null +++ b/ee/clickhouse/queries/funnels/funnel.py @@ -0,0 +1,197 @@ +from typing import List, Tuple, cast + +from ee.clickhouse.queries.breakdown_props import get_breakdown_cohort_name +from ee.clickhouse.queries.funnels.base import ClickhouseFunnelBase +from posthog.models.cohort import Cohort + + +class ClickhouseFunnel(ClickhouseFunnelBase): + """ + A basic ordered funnel. + + ## Query Intuition + We start with all events of interest (coming from the `FunnelEventQuery`). 
The query runs in different levels: at each + level, we first get the minimum timestamp of every event following the previous event. Then, we trickle up the levels, till we get to the top level, + which implies all events are sorted in increasing order. + Each level is a subquery. + + ## Exclusion Intuition + Event exclusion between steps means that if this specific event happened between two funnel steps, we disqualify the user, not showing them in the results. + To include event exclusions inside the funnel, the critical insight is that the exclusion is just like a parallel step to the funnel step that happens after + the exclusion start step. + For example, if we have a funnel with steps [1, 2, 3, 4] and we want to exclude events between step 2 and step 4, then the exclusion step semantics are just + like step 3 semantics. We want to find this event after step 2. + Since it's a parallel step, we don't need to add an extra level, we can reuse the existing levels. + See `get_comparison_cols` and `_get_partition_cols` for how this works. + + Exclusion doesn't support duplicates like: steps [event 1, event 2], and excluding event 1 between steps 1 and 2. + + """ + + def get_query(self): + max_steps = len(self._filter.entities) + + breakdown_clause = self._get_breakdown_prop() + + return f""" + SELECT {self._get_count_columns(max_steps)} {self._get_people_columns(max_steps)} {self._get_step_time_avgs(max_steps)} {self._get_step_time_median(max_steps)} {breakdown_clause} FROM ( + {self.get_step_counts_query()} + ) {'GROUP BY prop' if breakdown_clause != '' else ''} SETTINGS allow_experimental_window_functions = 1 + """ + + def _get_people_columns(self, max_steps: int): + cols: List[str] = [] + + for i in range(max_steps): + cols.append(f"groupArrayIf(100)(DISTINCT aggregation_target, steps = {i + 1}) step_people_{i + 1}") + + formatted = ", ".join(cols) + return f", {formatted}" if formatted else "" + + def get_step_counts_query(self): + steps_per_person_query = self.get_step_counts_without_aggregation_query() + max_steps = len(self._filter.entities) + breakdown_clause = self._get_breakdown_prop() + inner_timestamps, outer_timestamps = self._get_timestamp_selects() + + return f""" + SELECT aggregation_target, steps {self._get_step_time_avgs(max_steps, inner_query=True)} {self._get_step_time_median(max_steps, inner_query=True)} {breakdown_clause} {outer_timestamps} FROM ( + SELECT aggregation_target, steps, max(steps) over (PARTITION BY aggregation_target {breakdown_clause}) as max_steps {self._get_step_time_names(max_steps)} {breakdown_clause} {inner_timestamps} FROM ( + {steps_per_person_query} + ) + ) GROUP BY aggregation_target, steps {breakdown_clause} + HAVING steps = max_steps + SETTINGS allow_experimental_window_functions = 1 + """ + + def _format_results(self, results): + if not results or len(results) == 0: + return [] + + if self._filter.breakdown: + return [self._format_single_funnel(res, with_breakdown=True) for res in results] + else: + return self._format_single_funnel(results[0]) + + def _format_single_funnel(self, result, with_breakdown=False): + # Format of this is [step order, person count (that reached that step), array of person uuids] + steps = [] + relevant_people = [] + total_people = 0 + + num_entities = len(self._filter.entities) + + for step in reversed(self._filter.entities): + + if result and len(result) > 0: + total_people += result[step.order] + relevant_people += result[cast(int, step.order) + num_entities] + + serialized_result = self._serialize_step(step, 
total_people, relevant_people[0:100]) + if cast(int, step.order) > 0: + serialized_result.update( + { + "average_conversion_time": result[cast(int, step.order) + num_entities * 2 - 1], + "median_conversion_time": result[cast(int, step.order) + num_entities * 3 - 2], + } + ) + else: + serialized_result.update({"average_conversion_time": None, "median_conversion_time": None}) + + if with_breakdown: + # breakdown will return a display ready value + # breakdown_value will return the underlying id if different from display ready value (ex: cohort id) + serialized_result.update( + { + "breakdown": get_breakdown_cohort_name(result[-1]) + if self._filter.breakdown_type == "cohort" + else result[-1], + "breakdown_value": result[-1], + } + ) + # important to not try and modify this value any how - as these are keys for fetching persons + + steps.append(serialized_result) + + return steps[::-1] #  reverse + + def get_step_counts_without_aggregation_query(self): + formatted_query = "" + max_steps = len(self._filter.entities) + if max_steps >= 2: + formatted_query = self.build_step_subquery(2, max_steps) + breakdown_query = self._get_breakdown_prop() + else: + formatted_query = self._get_inner_event_query() + breakdown_query = self._get_breakdown_prop(group_remaining=True) + + exclusion_clause = self._get_exclusion_condition() + + return f""" + SELECT *, {self._get_sorting_condition(max_steps, max_steps)} AS steps {exclusion_clause} {self._get_step_times(max_steps)} {breakdown_query} FROM ( + {formatted_query} + ) WHERE step_0 = 1 + {'AND exclusion = 0' if exclusion_clause else ''} + SETTINGS allow_experimental_window_functions = 1 + """ + + def _get_comparison_at_step(self, index: int, level_index: int): + or_statements: List[str] = [] + + for i in range(level_index, index + 1): + or_statements.append(f"latest_{i} < latest_{level_index - 1}") + + return " OR ".join(or_statements) + + def get_comparison_cols(self, level_index: int, max_steps: int): + """ + level_index: The current smallest comparison step. Everything before + level index is already at the minimum ordered timestamps. 
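+        For steps before level_index the latest_i columns pass through unchanged; for a later
+        step i, latest_i is NULLed out whenever any of latest_{level_index}..latest_i comes
+        before latest_{level_index - 1}, which is what enforces the ordering as the levels unroll.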
+ """ + cols: List[str] = [] + for i in range(0, max_steps): + cols.append(f"step_{i}") + if i < level_index: + cols.append(f"latest_{i}") + for exclusion_id, exclusion in enumerate(self._filter.exclusions): + if cast(int, exclusion.funnel_from_step) + 1 == i: + cols.append(f"exclusion_{exclusion_id}_latest_{exclusion.funnel_from_step}") + else: + comparison = self._get_comparison_at_step(i, level_index) + cols.append(f"if({comparison}, NULL, latest_{i}) as latest_{i}") + + for exclusion_id, exclusion in enumerate(self._filter.exclusions): + if cast(int, exclusion.funnel_from_step) + 1 == i: + exclusion_identifier = f"exclusion_{exclusion_id}_latest_{exclusion.funnel_from_step}" + cols.append( + f"if({exclusion_identifier} < latest_{exclusion.funnel_from_step}, NULL, {exclusion_identifier}) as {exclusion_identifier}" + ) + + return ", ".join(cols) + + def build_step_subquery(self, level_index: int, max_steps: int): + if level_index >= max_steps: + return f""" + SELECT + aggregation_target, + timestamp, + {self._get_partition_cols(1, max_steps)} + {self._get_breakdown_prop(group_remaining=True)} + FROM ({self._get_inner_event_query()}) + """ + else: + return f""" + SELECT + aggregation_target, + timestamp, + {self._get_partition_cols(level_index, max_steps)} + {self._get_breakdown_prop()} + FROM ( + SELECT + aggregation_target, + timestamp, + {self.get_comparison_cols(level_index, max_steps)} + {self._get_breakdown_prop()} + FROM ({self.build_step_subquery(level_index + 1, max_steps)}) + ) + """ diff --git a/ee/clickhouse/queries/funnels/funnel_correlation.py b/ee/clickhouse/queries/funnels/funnel_correlation.py new file mode 100644 index 0000000000000..190f5a96fada7 --- /dev/null +++ b/ee/clickhouse/queries/funnels/funnel_correlation.py @@ -0,0 +1,640 @@ +import dataclasses +from os import stat +from typing import Any, Dict, List, Literal, Tuple, TypedDict, cast + +from rest_framework.exceptions import ValidationError +from rest_framework.utils.serializer_helpers import ReturnList + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.models.element import chain_to_elements +from ee.clickhouse.models.event import ElementSerializer +from ee.clickhouse.models.property import get_property_string_expr +from ee.clickhouse.queries.column_optimizer import ColumnOptimizer +from ee.clickhouse.queries.funnels.funnel_persons import ClickhouseFunnelPersons +from ee.clickhouse.queries.person_query import ClickhousePersonQuery +from ee.clickhouse.sql.person import GET_TEAM_PERSON_DISTINCT_IDS +from posthog.constants import AUTOCAPTURE_EVENT, FunnelCorrelationType +from posthog.models import Filter, Team +from posthog.models.filters import Filter + + +class EventDefinition(TypedDict): + event: str + properties: Dict[str, Any] + elements: list + + +class EventOddsRatio(TypedDict): + event: str + + success_count: int + failure_count: int + + odds_ratio: float + correlation_type: Literal["success", "failure"] + + +class EventOddsRatioSerialized(TypedDict): + event: EventDefinition + success_count: int + failure_count: int + odds_ratio: float + correlation_type: Literal["success", "failure"] + + +class FunnelCorrelationResponse(TypedDict): + """ + The structure that the diagnose response will be returned in. 
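For reference, a hypothetical value of this structure (field names come from the surrounding TypedDicts; all numbers are made up):

```python
example_response = {
    "events": [
        {
            "event": {"event": "watched video", "properties": {}, "elements": []},
            "success_count": 5,
            "failure_count": 1,
            "odds_ratio": 8.25,
            "correlation_type": "success",
        }
    ],
    "skewed": False,
}
```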
+ NOTE: TypedDict is used here to comply with existing formats from other + queries, but we could use, for example, a dataclass + """ + + events: List[EventOddsRatioSerialized] + skewed: bool + + +@dataclasses.dataclass +class EventStats: + success_count: int + failure_count: int + + +@dataclasses.dataclass +class EventContingencyTable: + """ + Represents a contingency table for a single event. Note that this isn't a + complete contingency table, but rather only includes totals for + failure/success as opposed to including the number of successes for cases + that a persons _doesn't_ visit an event. + """ + + event: str + visited: EventStats + + success_total: int + failure_total: int + + +class FunnelCorrelation: + + TOTAL_IDENTIFIER = "Total_Values_In_Query" + ELEMENTS_DIVIDER = "__~~__" + AUTOCAPTURE_EVENT_TYPE = "$event_type" + MIN_PERSON_COUNT = 25 + MIN_PERSON_PERCENTAGE = 0.02 + PRIOR_COUNT = 1 + + def __init__(self, filter: Filter, team: Team) -> None: + self._filter = filter + self._team = team + + if self._filter.funnel_step is None: + self._filter = self._filter.with_data({"funnel_step": 1}) + # Funnel Step by default set to 1, to give us all people who entered the funnel + + # Used for generating the funnel persons cte + self._funnel_persons_generator = ClickhouseFunnelPersons( + self._filter, + self._team, + # NOTE: we want to include the latest timestamp of the `target_step`, + # from this we can deduce if the person reached the end of the funnel, + # i.e. successful + include_timestamp=True, + # NOTE: we don't need these as we have all the information we need to + # deduce if the person was successful or not + include_preceding_timestamp=False, + no_person_limit=True, + ) + + def support_autocapture_elements(self) -> bool: + if ( + self._filter.correlation_type == FunnelCorrelationType.EVENT_WITH_PROPERTIES + and AUTOCAPTURE_EVENT in self._filter.correlation_event_names + ): + return True + return False + + def get_contingency_table_query(self) -> Tuple[str, Dict[str, Any]]: + """ + Returns a query string and params, which are used to generate the contingency table. + The query returns success and failure count for event / property values, along with total success and failure counts. 
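To make that result shape concrete, the rows come back roughly as below (a hypothetical example; the special `Total_Values_In_Query` row carries the overall totals):

```python
# Hypothetical rows returned by the contingency table query:
# (name, success_count, failure_count)
rows = [
    ("watched video", 5, 1),
    ("viewed pricing page", 2, 10),
    ("Total_Values_In_Query", 7, 11),  # FunnelCorrelation.TOTAL_IDENTIFIER
]
```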
+ """ + if self._filter.correlation_type == FunnelCorrelationType.PROPERTIES: + return self.get_properties_query() + + if self._filter.correlation_type == FunnelCorrelationType.EVENT_WITH_PROPERTIES: + return self.get_event_property_query() + + return self.get_event_query() + + def get_event_query(self) -> Tuple[str, Dict[str, Any]]: + + funnel_persons_query, funnel_persons_params = self.get_funnel_persons_cte() + + event_join_query = self._get_events_join_query() + + query = f""" + WITH + funnel_people as ({funnel_persons_query}), + toDateTime(%(date_to)s) AS date_to, + toDateTime(%(date_from)s) AS date_from, + %(target_step)s AS target_step, + %(funnel_step_names)s as funnel_step_names + + SELECT + event.event AS name, + + -- If we have a `person.steps = target_step`, we know the person + -- reached the end of the funnel + countDistinctIf( + person.person_id, + person.steps = target_step + ) AS success_count, + + -- And the converse being for failures + countDistinctIf( + person.person_id, + person.steps <> target_step + ) AS failure_count + + FROM events AS event + {event_join_query} + AND event.event NOT IN %(exclude_event_names)s + GROUP BY name + + -- To get the total success/failure numbers, we do an aggregation on + -- the funnel people CTE and count distinct person_ids + UNION ALL + + SELECT + -- We're not using WITH TOTALS because the resulting queries are + -- not runnable in Metabase + '{self.TOTAL_IDENTIFIER}' as name, + + countDistinctIf( + person.person_id, + person.steps = target_step + ) AS success_count, + + countDistinctIf( + person.person_id, + person.steps <> target_step + ) AS failure_count + FROM funnel_people AS person + """ + params = { + **funnel_persons_params, + "funnel_step_names": [entity.id for entity in self._filter.events], + "target_step": len(self._filter.entities), + "exclude_event_names": self._filter.correlation_event_exclude_names, + } + + return query, params + + def get_event_property_query(self) -> Tuple[str, Dict[str, Any]]: + + if not self._filter.correlation_event_names: + raise ValidationError("Event Property Correlation expects atleast one event name to run correlation on") + + funnel_persons_query, funnel_persons_params = self.get_funnel_persons_cte() + + event_join_query = self._get_events_join_query() + + if self.support_autocapture_elements(): + event_type_expression, _ = get_property_string_expr( + "events", self.AUTOCAPTURE_EVENT_TYPE, f"'{self.AUTOCAPTURE_EVENT_TYPE}'", "properties", + ) + array_join_query = f""" + 'elements_chain' as prop_key, + concat({event_type_expression}, '{self.ELEMENTS_DIVIDER}', elements_chain) as prop_value, + tuple(prop_key, prop_value) as prop + """ + else: + array_join_query = f""" + arrayMap(x -> x.1, JSONExtractKeysAndValuesRaw(properties)) as prop_keys, + arrayMap(x -> trim(BOTH '"' FROM JSONExtractRaw(properties, x)), prop_keys) as prop_values, + arrayJoin(arrayZip(prop_keys, prop_values)) as prop + """ + + query = f""" + WITH + funnel_people as ({funnel_persons_query}), + toDateTime(%(date_to)s) AS date_to, + toDateTime(%(date_from)s) AS date_from, + %(target_step)s AS target_step, + %(funnel_step_names)s as funnel_step_names + + SELECT concat(event_name, '::', prop.1, '::', prop.2) as name, + countDistinctIf(person_id, steps = target_step) as success_count, + countDistinctIf(person_id, steps <> target_step) as failure_count + FROM ( + SELECT + person.person_id as person_id, + person.steps as steps, + events.event as event_name, + -- Same as what we do in $all property queries + {array_join_query} + 
FROM events AS event + {event_join_query} + AND event.event IN %(event_names)s + ) + GROUP BY name + -- Discard high cardinality / low hits properties + -- This removes the long tail of random properties with empty, null, or very small values + HAVING (success_count + failure_count) > 2 + AND prop.1 NOT IN %(exclude_property_names)s + + UNION ALL + -- To get the total success/failure numbers, we do an aggregation on + -- the funnel people CTE and count distinct person_ids + SELECT + '{self.TOTAL_IDENTIFIER}' as name, + + countDistinctIf( + person.person_id, + person.steps = target_step + ) AS success_count, + + countDistinctIf( + person.person_id, + person.steps <> target_step + ) AS failure_count + FROM funnel_people AS person + """ + params = { + **funnel_persons_params, + "funnel_step_names": [entity.id for entity in self._filter.events], + "target_step": len(self._filter.entities), + "event_names": self._filter.correlation_event_names, + "exclude_property_names": self._filter.correlation_event_exclude_property_names, + } + + return query, params + + def get_properties_query(self) -> Tuple[str, Dict[str, Any]]: + + if not self._filter.correlation_property_names: + raise ValidationError("Property Correlation expects atleast one Property to run correlation on") + + funnel_persons_query, funnel_persons_params = self.get_funnel_persons_cte() + + person_prop_query, person_prop_params = self._get_properties_prop_clause() + + person_query, person_query_params = ClickhousePersonQuery( + self._filter, self._team.pk, ColumnOptimizer(self._filter, self._team.pk) + ).get_query() + + query = f""" + WITH + funnel_people as ({funnel_persons_query}), + %(target_step)s AS target_step + SELECT + concat(prop.1, '::', prop.2) as name, + -- We generate a unique identifier for each property value as: PropertyName::Value + countDistinctIf(person_id, steps = target_step) AS success_count, + countDistinctIf(person_id, steps <> target_step) AS failure_count + FROM ( + SELECT + person_id, + funnel_people.steps as steps, + /* + We can extract multiple property values at the same time, since we're + already querying the person table. + This gives us something like: + -------------------- + person1, steps, [property_value_0, property_value_1, property_value_2] + person2, steps, [property_value_0, property_value_1, property_value_2] + + To group by property name, we need to extract the property from the array. ArrayJoin helps us do that. 
+ It transforms the above into: + + -------------------- + + person1, steps, property_value_0 + person1, steps, property_value_1 + person1, steps, property_value_2 + + person2, steps, property_value_0 + person2, steps, property_value_1 + person2, steps, property_value_2 + + To avoid clashes and clarify the values, we also zip with the property name, to generate + tuples like: (property_name, property_value), which we then group by + */ + {person_prop_query} + FROM funnel_people + JOIN ({person_query}) person + ON person.id = funnel_people.person_id + ) person_with_props + -- Group by the tuple items: (property_name, property_value) generated by zip + GROUP BY prop.1, prop.2 + HAVING prop.1 NOT IN %(exclude_property_names)s + UNION ALL + SELECT + '{self.TOTAL_IDENTIFIER}' as name, + countDistinctIf(person_id, steps = target_step) AS success_count, + countDistinctIf(person_id, steps <> target_step) AS failure_count + FROM funnel_people + """ + params = { + **funnel_persons_params, + **person_prop_params, + **person_query_params, + "target_step": len(self._filter.entities), + "property_names": self._filter.correlation_property_names, + "exclude_property_names": self._filter.correlation_property_exclude_names, + } + + return query, params + + def _get_events_join_query(self) -> str: + """ + This query is used to join and filter the events table corresponding to the funnel_people CTE. + It expects the following variables to be present in the CTE expression: + - funnel_people + - date_to + - date_from + - funnel_step_names + """ + + return f""" + JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) AS pdi + ON pdi.distinct_id = events.distinct_id + + -- NOTE: I would love to right join here, so we count get total + -- success/failure numbers in one pass, but this causes out of memory + -- error mentioning issues with right filling. I'm sure there's a way + -- to do it but lifes too short. + JOIN funnel_people AS person + ON pdi.person_id = person.person_id + + -- Make sure we're only looking at events before the final step, or + -- failing that, date_to + WHERE + -- add this condition in to ensure we can filter events before + -- joining funnel_people + event.timestamp >= date_from + AND event.timestamp < date_to + + AND event.team_id = {self._team.pk} + + -- Add in per person filtering on event time range. We just want + -- to include events that happened within the bounds of the + -- persons time in the funnel. 
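The per-person time-bounding conditions that follow can be read as a simple predicate; below is a minimal Python sketch (illustrative only, with hypothetical names) of the same logic, where `date_to` is only the final fallback when the person never reached the target step and the window bound is unavailable.

```python
from datetime import datetime, timedelta
from typing import Optional

def event_counts_for_person(
    event_ts: datetime,
    date_from: datetime,
    date_to: datetime,
    first_ts: datetime,            # person's entry into the funnel
    final_ts: Optional[datetime],  # when they reached the target step, if they did
    window: timedelta,             # the funnel conversion window
) -> bool:
    # Mirrors COALESCE(final_timestamp, first_timestamp + conversion window, date_to)
    upper = final_ts if final_ts is not None else first_ts + window
    return date_from <= event_ts < date_to and first_ts < event_ts < upper
```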
+ AND event.timestamp > person.first_timestamp + AND event.timestamp < COALESCE( + person.final_timestamp, + person.first_timestamp + INTERVAL {self._funnel_persons_generator._filter.funnel_window_interval} {self._funnel_persons_generator._filter.funnel_window_interval_unit_ch()}, + date_to) + -- Ensure that the event is not outside the bounds of the funnel conversion window + + -- Exclude funnel steps + AND event.event NOT IN funnel_step_names + """ + + def _get_properties_prop_clause(self): + + if "$all" in cast(list, self._filter.correlation_property_names): + return ( + f""" + arrayMap(x -> x.1, JSONExtractKeysAndValuesRaw({ClickhousePersonQuery.PERSON_PROPERTIES_ALIAS})) as person_prop_keys, + arrayJoin( + arrayZip( + person_prop_keys, + arrayMap(x -> trim(BOTH '"' FROM JSONExtractRaw({ClickhousePersonQuery.PERSON_PROPERTIES_ALIAS}, x)), person_prop_keys) + ) + ) as prop + """, + {}, + ) + else: + person_property_expressions = [] + person_property_params = {} + for index, property_name in enumerate(cast(list, self._filter.correlation_property_names)): + param_name = f"property_name_{index}" + expression, _ = get_property_string_expr( + "person", property_name, f"%({param_name})s", ClickhousePersonQuery.PERSON_PROPERTIES_ALIAS, + ) + person_property_params[param_name] = property_name + person_property_expressions.append(expression) + + return ( + f""" + arrayJoin(arrayZip( + %(property_names)s, + [{','.join(person_property_expressions)}] + )) as prop + """, + person_property_params, + ) + + def _run(self) -> Tuple[List[EventOddsRatio], bool]: + """ + Run the diagnose query. + + Funnel Correlation queries take as input the same as the funnel query, + and returns the correlation of person events with a person successfully + getting to the end of the funnel. We use Odds Ratios as the correlation + metric. See https://en.wikipedia.org/wiki/Odds_ratio for more details. + + Roughly speaking, to calculate the odds ratio, we build a contingency + table https://en.wikipedia.org/wiki/Contingency_table for each + dimension, then calculate the odds ratio for each. + + For example, take for simplicity the cohort of all people, and the + success criteria of having a "signed up" event. First we would build a + contingency table like: + + | | success | failure | total | + | -----------------: | :-----: | :-----: | :---: | + | watched video | 5 | 1 | 6 | + | didn't watch video | 2 | 10 | 12 | + + + Then the odds that a person signs up given they watched the video is 5 / + 1. + + And the odds that a person signs up given they didn't watch the video is + 2 / 10. + + So we say the odds ratio is 5 / 1 over 2 / 10 = 25 . The further away the + odds ratio is from 1, the greater the correlation. + + Requirements: + + - Intitially we only need to consider the names of events that a cohort + person has emitted. So we explicitly are not interested in e.g. + correlating properties, although this will be a follow-up. + + Non-functional requirements: + + - there can be perhaps millions of people in a cohort, so we should + consider this when writing the algorithm. e.g. we should probably + avoid pulling all people into across the wire. + - there can be an order of magnitude more events than people, so we + should avoid pulling all events across the wire. + - there may be a large but not huge number of distinct events, let's say + 100 different names for events. 
We should avoid n+1 queries for the + event names dimension + + Contincency tables are something we can pull out of the db, so we can + have a query that: + + 1. filters people by the cohort criteria + 2. groups these people by the success criteria + 3. groups people by our criterion with which we want to test + correlation, e.g. "watched video" + + """ + + event_contingency_tables, success_total, failure_total = self.get_partial_event_contingency_tables() + + if not success_total or not failure_total: + return [], True + + skewed_totals = False + + # If the ratio is greater than 1:10, then we have a skewed result, so we should + # warn the user. + if success_total / failure_total > 10 or failure_total / success_total > 10: + skewed_totals = True + + odds_ratios = [ + get_entity_odds_ratio(event_stats, FunnelCorrelation.PRIOR_COUNT) + for event_stats in event_contingency_tables + if not FunnelCorrelation.are_results_insignificant(event_stats) + ] + + positively_correlated_events = sorted( + [odds_ratio for odds_ratio in odds_ratios if odds_ratio["correlation_type"] == "success"], + key=lambda x: x["odds_ratio"], + reverse=True, + ) + + negatively_correlated_events = sorted( + [odds_ratio for odds_ratio in odds_ratios if odds_ratio["correlation_type"] == "failure"], + key=lambda x: x["odds_ratio"], + reverse=False, + ) + + # Return the top ten positively correlated events, and top then negatively correlated events + events = positively_correlated_events[:10] + negatively_correlated_events[:10] + return events, skewed_totals + + def format_results(self, results: Tuple[List[EventOddsRatio], bool]) -> FunnelCorrelationResponse: + return { + "events": [ + { + "success_count": odds_ratio["success_count"], + "failure_count": odds_ratio["failure_count"], + "odds_ratio": odds_ratio["odds_ratio"], + "correlation_type": odds_ratio["correlation_type"], + "event": self.serialize_event_with_property(odds_ratio["event"]), + } + for odds_ratio in results[0] + ], + "skewed": results[1], + } + + def run(self) -> FunnelCorrelationResponse: + if not self._filter.entities: + return FunnelCorrelationResponse(events=[], skewed=False) + + return self.format_results(self._run()) + + def get_partial_event_contingency_tables(self) -> Tuple[List[EventContingencyTable], int, int]: + """ + For each event a person that started going through the funnel, gets stats + for how many of these users are sucessful and how many are unsuccessful. + + It's a partial table as it doesn't include numbers of the negation of the + event, but does include the total success/failure numbers, which is enough + for us to calculate the odds ratio. + """ + + query, params = self.get_contingency_table_query() + results_with_total = sync_execute(query, params) + + # Get the total success/failure counts from the results + results = [result for result in results_with_total if result[0] != self.TOTAL_IDENTIFIER] + _, success_total, failure_total = [ + result for result in results_with_total if result[0] == self.TOTAL_IDENTIFIER + ][0] + + # Add a little structure, and keep it close to the query definition so it's + # obvious what's going on with result indices. 
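As a sanity check of the arithmetic described in the docstring above (the real implementation, `get_entity_odds_ratio`, additionally adds `PRIOR_COUNT` to every cell to avoid division by zero):

```python
# Contingency table from the docstring above:
#                      success  failure
#  watched video          5        1
#  didn't watch video     2       10
odds_ratio = (5 * 10) / (2 * 1)  # (5 / 1) divided by (2 / 10), cross-multiplied
assert odds_ratio == 25.0        # the further from 1, the stronger the correlation
```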
+ return ( + [ + EventContingencyTable( + event=result[0], + visited=EventStats(success_count=result[1], failure_count=result[2]), + success_total=success_total, + failure_total=failure_total, + ) + for result in results + ], + success_total, + failure_total, + ) + + def get_funnel_persons_cte(self) -> Tuple[str, Dict[str, Any]]: + + return ( + self._funnel_persons_generator.get_query(extra_fields=["steps", "final_timestamp", "first_timestamp"]), + self._funnel_persons_generator.params, + ) + + @staticmethod + def are_results_insignificant(event_contingency_table: EventContingencyTable) -> bool: + """ + Check if the results are insignificant, i.e. if the success/failure counts are + significantly different from the total counts + """ + + total_count = event_contingency_table.success_total + event_contingency_table.failure_total + + if event_contingency_table.visited.success_count + event_contingency_table.visited.failure_count < min( + FunnelCorrelation.MIN_PERSON_COUNT, FunnelCorrelation.MIN_PERSON_PERCENTAGE * total_count + ): + return True + + return False + + def serialize_event_with_property(self, event: str) -> EventDefinition: + """ + Format the event name for display. + """ + if not self.support_autocapture_elements(): + return EventDefinition(event=event, properties={}, elements=[]) + + event_name, property_name, property_value = event.split("::") + if event_name == AUTOCAPTURE_EVENT and property_name == "elements_chain": + + event_type, elements_chain = property_value.split(self.ELEMENTS_DIVIDER) + return EventDefinition( + event=event, + properties={self.AUTOCAPTURE_EVENT_TYPE: event_type}, + elements=cast(list, ElementSerializer(chain_to_elements(elements_chain), many=True).data), + ) + + return EventDefinition(event=event, properties={}, elements=[]) + + +def get_entity_odds_ratio(event_contingency_table: EventContingencyTable, prior_counts: int) -> EventOddsRatio: + + # Add 1 to all values to prevent divide by zero errors, and introduce a [prior](https://en.wikipedia.org/wiki/Prior_probability) + odds_ratio = ( + (event_contingency_table.visited.success_count + prior_counts) + * (event_contingency_table.failure_total - event_contingency_table.visited.failure_count + prior_counts) + ) / ( + (event_contingency_table.success_total - event_contingency_table.visited.success_count + prior_counts) + * (event_contingency_table.visited.failure_count + prior_counts) + ) + + return EventOddsRatio( + event=event_contingency_table.event, + success_count=event_contingency_table.visited.success_count, + failure_count=event_contingency_table.visited.failure_count, + odds_ratio=odds_ratio, + correlation_type="success" if odds_ratio > 1 else "failure", + ) diff --git a/ee/clickhouse/queries/funnels/funnel_correlation_persons.py b/ee/clickhouse/queries/funnels/funnel_correlation_persons.py new file mode 100644 index 0000000000000..cae927ed554d2 --- /dev/null +++ b/ee/clickhouse/queries/funnels/funnel_correlation_persons.py @@ -0,0 +1,94 @@ +from typing import Any, Dict, List, Optional, Tuple, cast + +from rest_framework.exceptions import ValidationError +from rest_framework.utils.serializer_helpers import ReturnDict, ReturnList + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.models.property import get_property_string_expr +from ee.clickhouse.queries.funnels.funnel_correlation import FunnelCorrelation +from ee.clickhouse.queries.funnels.funnel_event_query import FunnelEventQuery +from ee.clickhouse.sql.person import GET_TEAM_PERSON_DISTINCT_IDS +from posthog.constants import 
FUNNEL_CORRELATION_PERSON_LIMIT +from posthog.models import Person +from posthog.models.entity import Entity +from posthog.models.filters.filter import Filter +from posthog.models.team import Team + + +class FunnelCorrelationPersons: + def __init__(self, filter: Filter, team: Team) -> None: + self._funnel_correlation = FunnelCorrelation(filter, team) + self._filter = filter + self._team = team + + if not self._filter.correlation_person_limit: + self._filter = self._filter.with_data({FUNNEL_CORRELATION_PERSON_LIMIT: 100}) + + def run(self): + """ + Returns `ReturnList` type, generated by `serializers.serialize`, which returns the Person model. + """ + + if not self._filter.correlation_person_entity: + raise ValidationError("No entity for persons specified") + + query, params = self.get_query() + results: List[Tuple[str]] = sync_execute(query, params) + + return self._format_results(results) + + def get_query(self) -> Tuple[str, Dict[str, Any]]: + + assert isinstance(self._filter.correlation_person_entity, Entity) + + funnel_persons_query, funnel_persons_params = self._funnel_correlation.get_funnel_persons_cte() + + prop_filters = self._filter.correlation_person_entity.properties + prop_query, prop_params = FunnelEventQuery(self._filter, self._team.pk)._get_props(prop_filters) + + conversion_filter = ( + f'AND person.steps {"=" if self._filter.correlation_persons_converted else "<>"} target_step' + if self._filter.correlation_persons_converted is not None + else "" + ) + + event_join_query = self._funnel_correlation._get_events_join_query() + + query = f""" + WITH + funnel_people as ({funnel_persons_query}), + toDateTime(%(date_to)s) AS date_to, + toDateTime(%(date_from)s) AS date_from, + %(target_step)s AS target_step, + %(funnel_step_names)s as funnel_step_names + SELECT + DISTINCT person.person_id as person_id + FROM events AS event + {event_join_query} + AND event.event = %(target_event)s + {conversion_filter} + {prop_query} + ORDER BY person_id + LIMIT {self._filter.correlation_person_limit} + OFFSET {self._filter.correlation_person_offset} + """ + + params = { + **funnel_persons_params, + **prop_params, + "target_event": self._filter.correlation_person_entity.id, + "funnel_step_names": [entity.id for entity in self._filter.events], + "target_step": len(self._filter.entities), + } + + return query, params + + def _format_results(self, results: List[Tuple[str]]): + people = Person.objects.filter(team_id=self._team.pk, uuid__in=[val[0] for val in results]) + + from posthog.api.person import PersonSerializer + + return ( + PersonSerializer(people, many=True).data, + len(results) > cast(int, self._filter.correlation_person_limit) - 1, + ) diff --git a/ee/clickhouse/queries/funnels/funnel_event_query.py b/ee/clickhouse/queries/funnels/funnel_event_query.py new file mode 100644 index 0000000000000..be96c7d3809fb --- /dev/null +++ b/ee/clickhouse/queries/funnels/funnel_event_query.py @@ -0,0 +1,89 @@ +from typing import Any, Dict, Tuple + +from ee.clickhouse.models.group import get_aggregation_target_field +from ee.clickhouse.queries.event_query import ClickhouseEventQuery +from posthog.constants import TREND_FILTER_TYPE_ACTIONS + + +class FunnelEventQuery(ClickhouseEventQuery): + def get_query(self, entities=None, entity_name="events", skip_entity_filter=False) -> Tuple[str, Dict[str, Any]]: + _fields = [ + f"{self.EVENT_TABLE_ALIAS}.event as event", + f"{self.EVENT_TABLE_ALIAS}.team_id as team_id", + f"{self.EVENT_TABLE_ALIAS}.distinct_id as distinct_id", + 
f"{self.EVENT_TABLE_ALIAS}.timestamp as timestamp", + ( + f"{self.EVENT_TABLE_ALIAS}.elements_chain as elements_chain" + if self._column_optimizer.should_query_elements_chain_column + else "" + ), + f"{get_aggregation_target_field(self._filter.aggregation_group_type_index, self.EVENT_TABLE_ALIAS, self.DISTINCT_ID_TABLE_ALIAS)} as aggregation_target", + ] + + _fields.extend( + f"{self.EVENT_TABLE_ALIAS}.{column_name} as {column_name}" + for column_name in self._column_optimizer.event_columns_to_query + ) + + _fields.extend( + f"groups_{group_index}.group_properties_{group_index} as group_properties_{group_index}" + for group_index in self._column_optimizer.group_types_to_query + ) + + if self._should_join_persons: + _fields.extend( + f"{self.PERSON_TABLE_ALIAS}.{column_name} as {column_name}" for column_name in self._person_query.fields + ) + + _fields = list(filter(None, _fields)) + + date_query, date_params = self._get_date_filter() + self.params.update(date_params) + + prop_filters = self._filter.properties + prop_query, prop_params = self._get_props(prop_filters) + self.params.update(prop_params) + + if skip_entity_filter: + entity_query = "" + entity_params: Dict[str, Any] = {} + else: + entity_query, entity_params = self._get_entity_query(entities, entity_name) + + self.params.update(entity_params) + + person_query, person_params = self._get_person_query() + self.params.update(person_params) + + groups_query, groups_params = self._get_groups_query() + self.params.update(groups_params) + + query = f""" + SELECT {', '.join(_fields)} FROM events {self.EVENT_TABLE_ALIAS} + {self._get_disintct_id_query()} + {person_query} + {groups_query} + WHERE team_id = %(team_id)s + {entity_query} + {date_query} + {prop_query} + """ + + return query, self.params + + def _determine_should_join_distinct_ids(self) -> None: + self._should_join_distinct_ids = True + + def _get_entity_query(self, entities=None, entity_name="events") -> Tuple[str, Dict[str, Any]]: + events = set() + entities_to_use = entities or self._filter.entities + + for entity in entities_to_use: + if entity.type == TREND_FILTER_TYPE_ACTIONS: + action = entity.get_action() + for action_step in action.steps.all(): + events.add(action_step.event) + else: + events.add(entity.id) + + return f"AND event IN %({entity_name})s", {entity_name: sorted(list(events))} diff --git a/ee/clickhouse/queries/funnels/funnel_persons.py b/ee/clickhouse/queries/funnels/funnel_persons.py new file mode 100644 index 0000000000000..ef3d67b63d36f --- /dev/null +++ b/ee/clickhouse/queries/funnels/funnel_persons.py @@ -0,0 +1,26 @@ +from typing import Any, Dict, List, Optional, cast + +from ee.clickhouse.queries.funnels.funnel import ClickhouseFunnel +from ee.clickhouse.sql.funnels.funnel import FUNNEL_PERSONS_BY_STEP_SQL +from posthog.models import Person +from posthog.models.filters.filter import Filter +from posthog.models.team import Team + + +class ClickhouseFunnelPersons(ClickhouseFunnel): + def get_query(self, extra_fields: Optional[List[str]] = None): + extra_fields_string = ", ".join([self._get_timestamp_outer_select()] + (extra_fields or [])) + return FUNNEL_PERSONS_BY_STEP_SQL.format( + offset=self._filter.offset, + steps_per_person_query=self.get_step_counts_query(), + persons_steps=self._get_funnel_person_step_condition(), + extra_fields=extra_fields_string, + limit="" if self._no_person_limit else "LIMIT %(limit)s", + ) + + def _format_results(self, results): + people = Person.objects.filter(team_id=self._team.pk, uuid__in=[val[0] for val in results]) 
+ + from posthog.api.person import PersonSerializer + + return PersonSerializer(people, many=True).data, len(results) > cast(int, self._filter.limit) - 1 diff --git a/ee/clickhouse/queries/funnels/funnel_strict.py b/ee/clickhouse/queries/funnels/funnel_strict.py new file mode 100644 index 0000000000000..ed749c1857ca0 --- /dev/null +++ b/ee/clickhouse/queries/funnels/funnel_strict.py @@ -0,0 +1,67 @@ +from typing import List + +from ee.clickhouse.queries.funnels.base import ClickhouseFunnelBase + + +class ClickhouseFunnelStrict(ClickhouseFunnelBase): + def get_query(self): + max_steps = len(self._filter.entities) + + breakdown_clause = self._get_breakdown_prop() + + return f""" + SELECT {self._get_count_columns(max_steps)} {self._get_step_time_avgs(max_steps)} {self._get_step_time_median(max_steps)} {breakdown_clause} FROM ( + {self.get_step_counts_query()} + ) {'GROUP BY prop' if breakdown_clause != '' else ''} SETTINGS allow_experimental_window_functions = 1 + """ + + def get_step_counts_query(self): + + steps_per_person_query = self.get_step_counts_without_aggregation_query() + max_steps = len(self._filter.entities) + breakdown_clause = self._get_breakdown_prop() + inner_timestamps, outer_timestamps = self._get_timestamp_selects() + + return f""" + SELECT aggregation_target, steps {self._get_step_time_avgs(max_steps, inner_query=True)} {self._get_step_time_median(max_steps, inner_query=True)} {breakdown_clause} {outer_timestamps} FROM ( + SELECT aggregation_target, steps, max(steps) over (PARTITION BY aggregation_target {breakdown_clause}) as max_steps {self._get_step_time_names(max_steps)} {breakdown_clause} {inner_timestamps} FROM ( + {steps_per_person_query} + ) + ) GROUP BY aggregation_target, steps {breakdown_clause} + HAVING steps = max_steps + """ + + def get_step_counts_without_aggregation_query(self): + max_steps = len(self._filter.entities) + + partition_select = self._get_partition_cols(1, max_steps) + sorting_condition = self._get_sorting_condition(max_steps, max_steps) + breakdown_clause = self._get_breakdown_prop(group_remaining=True) + + inner_query = f""" + SELECT + aggregation_target, + timestamp, + {partition_select} + {breakdown_clause} + FROM ({self._get_inner_event_query(skip_entity_filter=True, skip_step_filter=True)}) + """ + + formatted_query = f""" + SELECT *, {sorting_condition} AS steps {self._get_step_times(max_steps)} FROM ( + {inner_query} + ) WHERE step_0 = 1""" + + return formatted_query + + def _get_partition_cols(self, level_index: int, max_steps: int): + cols: List[str] = [] + for i in range(0, max_steps): + cols.append(f"step_{i}") + if i < level_index: + cols.append(f"latest_{i}") + else: + cols.append( + f"min(latest_{i}) over (PARTITION by aggregation_target {self._get_breakdown_prop()} ORDER BY timestamp DESC ROWS BETWEEN {i} PRECEDING AND {i} PRECEDING) latest_{i}" + ) + return ", ".join(cols) diff --git a/ee/clickhouse/queries/funnels/funnel_strict_persons.py b/ee/clickhouse/queries/funnels/funnel_strict_persons.py new file mode 100644 index 0000000000000..582a4a874d91a --- /dev/null +++ b/ee/clickhouse/queries/funnels/funnel_strict_persons.py @@ -0,0 +1,23 @@ +from typing import cast + +from ee.clickhouse.queries.funnels.funnel_strict import ClickhouseFunnelStrict +from ee.clickhouse.sql.funnels.funnel import FUNNEL_PERSONS_BY_STEP_SQL +from posthog.models import Person + + +class ClickhouseFunnelStrictPersons(ClickhouseFunnelStrict): + def get_query(self): + return FUNNEL_PERSONS_BY_STEP_SQL.format( + offset=self._filter.offset, + 
steps_per_person_query=self.get_step_counts_query(), + persons_steps=self._get_funnel_person_step_condition(), + extra_fields=self._get_timestamp_outer_select(), + limit="" if self._no_person_limit else "LIMIT %(limit)s", + ) + + def _format_results(self, results): + people = Person.objects.filter(team_id=self._team.pk, uuid__in=[val[0] for val in results]) + + from posthog.api.person import PersonSerializer + + return PersonSerializer(people, many=True).data, len(results) > cast(int, self._filter.limit) - 1 diff --git a/ee/clickhouse/queries/funnels/funnel_time_to_convert.py b/ee/clickhouse/queries/funnels/funnel_time_to_convert.py new file mode 100644 index 0000000000000..4e2a03110b4c6 --- /dev/null +++ b/ee/clickhouse/queries/funnels/funnel_time_to_convert.py @@ -0,0 +1,121 @@ +from typing import Type + +from rest_framework.exceptions import ValidationError + +from ee.clickhouse.queries.funnels.base import ClickhouseFunnelBase +from ee.clickhouse.queries.funnels.funnel import ClickhouseFunnel +from posthog.constants import FUNNEL_TO_STEP +from posthog.models.filters.filter import Filter +from posthog.models.team import Team + + +class ClickhouseFunnelTimeToConvert(ClickhouseFunnelBase): + def __init__( + self, filter: Filter, team: Team, funnel_order_class: Type[ClickhouseFunnelBase] = ClickhouseFunnel + ) -> None: + super().__init__(filter, team) + self.funnel_order = funnel_order_class(filter, team) + + def _format_results(self, results: list) -> dict: + return { + "bins": [(bin_from_seconds, person_count) for bin_from_seconds, person_count, _ in results], + "average_conversion_time": results[0][2], + } + + def get_query(self) -> str: + steps_per_person_query = self.funnel_order.get_step_counts_query() + self.params.update(self.funnel_order.params) + # expects 1 person per row, whatever their max step is, and the step conversion times for this person + + # Conversion from which step should be calculated + from_step = self._filter.funnel_from_step or 0 + # Conversion to which step should be calculated + to_step = self._filter.funnel_to_step or len(self._filter.entities) - 1 + + # Use custom bin_count if provided by user, otherwise infer an automatic one based on the number of samples + bin_count = self._filter.bin_count + if bin_count is not None: + # Custom count is clamped between 1 and 90 + if bin_count < 1: + bin_count = 1 + elif bin_count > 90: + bin_count = 90 + bin_count_identifier = str(bin_count) + bin_count_expression = None + else: + # Auto count is clamped between 3 and 60 + bin_count_identifier = "bin_count" + bin_count_expression = f""" + count() AS sample_count, + least(60, greatest(3, ceil(cbrt(sample_count)))) AS {bin_count_identifier}, + """ + + if not (0 < to_step < len(self._filter.entities)): + raise ValidationError( + f'Filter parameter {FUNNEL_TO_STEP} can only be one of {", ".join(map(str, range(1, len(self._filter.entities))))} for time to convert!' 
+ ) + + steps_average_conversion_time_identifiers = [ + f"step_{step+1}_average_conversion_time_inner" for step in range(from_step, to_step) + ] + steps_average_conversion_time_expression_sum = " + ".join(steps_average_conversion_time_identifiers) + + steps_average_conditional_for_invalid_values = [ + f"{identifier} >= 0" for identifier in steps_average_conversion_time_identifiers + ] + # :HACK: Protect against CH bug https://github.com/ClickHouse/ClickHouse/issues/26580 + # once the issue is resolved, stop skipping the test: test_auto_bin_count_single_step_duplicate_events + # and remove this comment + + query = f""" + WITH + step_runs AS ( + SELECT * FROM ( + {steps_per_person_query} + ) WHERE {" AND ".join(steps_average_conditional_for_invalid_values)} + ), + histogram_params AS ( + /* Binning ensures that each sample belongs to a bin in results */ + /* If bin_count is not a custom number, it's calculated in bin_count_expression */ + SELECT + floor(min({steps_average_conversion_time_expression_sum})) AS from_seconds, + ceil(max({steps_average_conversion_time_expression_sum})) AS to_seconds, + round(avg({steps_average_conversion_time_expression_sum}), 2) AS average_conversion_time, + {bin_count_expression or ""} + ceil((to_seconds - from_seconds) / {bin_count_identifier}) AS bin_width_seconds_raw, + /* Use 60 seconds as fallback bin width in case of only one sample */ + if(bin_width_seconds_raw > 0, bin_width_seconds_raw, 60) AS bin_width_seconds + FROM step_runs + ), + /* Below CTEs make histogram_params columns available to the query below as straightforward identifiers */ + ( SELECT bin_width_seconds FROM histogram_params ) AS bin_width_seconds, + /* bin_count is only made available as an identifier if it had to be calculated */ + { + f"( SELECT {bin_count_identifier} FROM histogram_params ) AS {bin_count_identifier}," + if bin_count_expression else "" + } + ( SELECT from_seconds FROM histogram_params ) AS histogram_from_seconds, + ( SELECT to_seconds FROM histogram_params ) AS histogram_to_seconds, + ( SELECT average_conversion_time FROM histogram_params ) AS histogram_average_conversion_time + SELECT + bin_from_seconds, + person_count, + histogram_average_conversion_time AS average_conversion_time + FROM ( + /* Calculating bins from step runs */ + SELECT + histogram_from_seconds + floor(({steps_average_conversion_time_expression_sum} - histogram_from_seconds) / bin_width_seconds) * bin_width_seconds AS bin_from_seconds, + count() AS person_count + FROM step_runs + GROUP BY bin_from_seconds + ) results + RIGHT OUTER JOIN ( + /* Making sure bin_count bins are returned */ + /* Those not present in the results query due to lack of data simply get person_count 0 */ + SELECT histogram_from_seconds + number * bin_width_seconds AS bin_from_seconds FROM system.numbers LIMIT {bin_count_identifier} + 1 + ) fill + USING (bin_from_seconds) + ORDER BY bin_from_seconds + SETTINGS allow_experimental_window_functions = 1""" + + return query diff --git a/ee/clickhouse/queries/funnels/funnel_trends.py b/ee/clickhouse/queries/funnels/funnel_trends.py new file mode 100644 index 0000000000000..e6c5cc38df08a --- /dev/null +++ b/ee/clickhouse/queries/funnels/funnel_trends.py @@ -0,0 +1,232 @@ +from datetime import date, datetime +from itertools import groupby +from typing import Optional, Tuple, Type, Union, cast + +from dateutil.relativedelta import relativedelta + +from ee.clickhouse.queries.funnels.base import ClickhouseFunnelBase +from ee.clickhouse.queries.funnels.funnel import ClickhouseFunnel +from 
ee.clickhouse.queries.util import ( + format_ch_timestamp, + get_earliest_timestamp, + get_interval_func_ch, + get_trunc_func_ch, +) +from posthog.models.cohort import Cohort +from posthog.models.filters.filter import Filter +from posthog.models.team import Team + +TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S" +HUMAN_READABLE_TIMESTAMP_FORMAT = "%-d-%b-%Y" + + +class ClickhouseFunnelTrends(ClickhouseFunnelBase): + """ + ## Funnel trends assumptions + + Funnel trends are a graph of conversion over time – meaning a Y ({conversion_rate}) for each X ({entrance_period}). + + ### What is {entrance_period}? + + A funnel is considered entered by a user when they have performed its first step. + When that happens, we consider that an entrance of funnel. + + Now, our time series is based on a sequence of {entrance_period}s, each starting at {entrance_period_start} + and ending _right before the next_ {entrance_period_start}. A person is then counted at most once in each + {entrance_period}. + + ### What is {conversion_rate}? + + Each time a funnel is entered by a person, they have exactly {funnel_window_interval} {funnel_window_interval_unit} to go + through the funnel's steps. Later events are just not taken into account. + + For {conversion_rate}, we need to know reference steps: {from_step} and {to_step}. + By default they are respectively the first and the last steps of the funnel. + + Then for each {entrance_period} we calculate {reached_from_step_count} – the number of persons + who entered the funnel and reached step {from_step} (along with all the steps leading up to it, if there any). + Similarly we calculate {reached_to_step_count}, which is the number of persons from {reached_from_step_count} + who also reached step {to_step} (along with all the steps leading up to it, including of course step {from_step}). + + {conversion_rate} is simply {reached_to_step_count} divided by {reached_from_step_count}, + multiplied by 100 to be a percentage. + + If no people have reached step {from_step} in the period, {conversion_rate} is zero. + """ + + def __init__( + self, filter: Filter, team: Team, funnel_order_class: Type[ClickhouseFunnelBase] = ClickhouseFunnel + ) -> None: + + super().__init__(filter, team) + + self.funnel_order = funnel_order_class(filter, team) + + def _exec_query(self): + return self._summarize_data(super()._exec_query()) + + def get_step_counts_without_aggregation_query( + self, *, specific_entrance_period_start: Optional[datetime] = None + ) -> str: + steps_per_person_query = self.funnel_order.get_step_counts_without_aggregation_query() + + trunc_func = get_trunc_func_ch(self._filter.interval) + + # This is used by funnel trends when we only need data for one period, e.g. person per data point + if specific_entrance_period_start: + self.params["entrance_period_start"] = specific_entrance_period_start.strftime(TIMESTAMP_FORMAT) + + breakdown_clause = self._get_breakdown_prop() + return f""" + SELECT + aggregation_target, + {trunc_func}(timestamp) AS entrance_period_start, + max(steps) AS steps_completed + {breakdown_clause} + FROM ( + {steps_per_person_query} + ) + {"WHERE toDateTime(entrance_period_start) = %(entrance_period_start)s" if specific_entrance_period_start else ""} + GROUP BY aggregation_target, entrance_period_start {breakdown_clause}""" + + def get_query(self) -> str: + step_counts = self.get_step_counts_without_aggregation_query() + # Expects multiple rows for same person, first event time, steps taken. 
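Before the SQL below, a minimal pure-Python sketch (illustrative only, not part of this diff) of the per-period aggregation the trends query performs, using the docstring's definitions of {reached_from_step_count}, {reached_to_step_count} and {conversion_rate}; it assumes one row per person per entrance period:

```python
from collections import defaultdict

def conversion_rate_by_period(rows, from_step=0, to_step=1):
    """rows: (aggregation_target, entrance_period_start, steps_completed) tuples."""
    reached_from = defaultdict(int)
    reached_to = defaultdict(int)
    for _person, period_start, steps_completed in rows:
        if steps_completed >= from_step + 1:      # reached {from_step}
            reached_from[period_start] += 1
            if steps_completed >= to_step + 1:    # also reached {to_step}
                reached_to[period_start] += 1
    return {
        period: round(reached_to[period] / reached_from[period] * 100, 2)
        for period in reached_from
    }

# e.g. 2 of the 3 people who entered on 2021-06-01 converted -> 66.67%
assert conversion_rate_by_period(
    [("p1", "2021-06-01", 2), ("p2", "2021-06-01", 2), ("p3", "2021-06-01", 1)]
) == {"2021-06-01": 66.67}
```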
+ self.params.update(self.funnel_order.params) + + reached_from_step_count_condition, reached_to_step_count_condition, _ = self.get_steps_reached_conditions() + trunc_func = get_trunc_func_ch(self._filter.interval) + interval_func = get_interval_func_ch(self._filter.interval) + + if self._filter.date_from is None: + _date_from = get_earliest_timestamp(self._team.pk) + else: + _date_from = self._filter.date_from + + breakdown_clause = self._get_breakdown_prop() + formatted_date_from = format_ch_timestamp(_date_from, self._filter) + formatted_date_to = format_ch_timestamp(self._filter.date_to, self._filter) + + self.params.update( + { + "formatted_date_from": formatted_date_from, + "formatted_date_to": formatted_date_to, + "interval": self._filter.interval, + } + ) + + query = f""" + SELECT + entrance_period_start, + reached_from_step_count, + reached_to_step_count, + if(reached_from_step_count > 0, round(reached_to_step_count / reached_from_step_count * 100, 2), 0) AS conversion_rate + {breakdown_clause} + FROM ( + SELECT + entrance_period_start, + countIf({reached_from_step_count_condition}) AS reached_from_step_count, + countIf({reached_to_step_count_condition}) AS reached_to_step_count + {breakdown_clause} + FROM ( + {step_counts} + ) GROUP BY entrance_period_start {breakdown_clause} + ) data + RIGHT OUTER JOIN ( + SELECT + {trunc_func}(toDateTime(%(formatted_date_from)s) + {interval_func}(number)) AS entrance_period_start + {', breakdown_value as prop' if breakdown_clause else ''} + FROM numbers(dateDiff(%(interval)s, toDateTime(%(formatted_date_from)s), toDateTime(%(formatted_date_to)s)) + 1) AS period_offsets + {'ARRAY JOIN (%(breakdown_values)s) AS breakdown_value' if breakdown_clause else ''} + ) fill + USING (entrance_period_start {breakdown_clause}) + ORDER BY entrance_period_start ASC + SETTINGS allow_experimental_window_functions = 1""" + + return query + + def get_steps_reached_conditions(self) -> Tuple[str, str, str]: + # How many steps must have been done to count for the denominator of a funnel trends data point + from_step = self._filter.funnel_from_step or 0 + # How many steps must have been done to count for the numerator of a funnel trends data point + to_step = self._filter.funnel_to_step or len(self._filter.entities) - 1 + + # Those who converted OR dropped off + reached_from_step_count_condition = f"steps_completed >= {from_step+1}" + # Those who converted + reached_to_step_count_condition = f"steps_completed >= {to_step+1}" + # Those who dropped off + did_not_reach_to_step_count_condition = f"{reached_from_step_count_condition} AND steps_completed < {to_step+1}" + return reached_from_step_count_condition, reached_to_step_count_condition, did_not_reach_to_step_count_condition + + def _summarize_data(self, results): + + breakdown_clause = self._get_breakdown_prop() + + summary = [] + for period_row in results: + serialized_result = { + "timestamp": period_row[0], + "reached_from_step_count": period_row[1], + "reached_to_step_count": period_row[2], + "conversion_rate": period_row[3], + "is_period_final": self._is_period_final(period_row[0]), + } + + if breakdown_clause: + serialized_result.update( + { + "breakdown_value": period_row[-1] + if isinstance(period_row[-1], str) + else Cohort.objects.get(pk=period_row[-1]).name + } + ) + + summary.append(serialized_result) + return summary + + def _format_results(self, summary): + + if self._filter.breakdown: + grouper = lambda row: row["breakdown_value"] + sorted_data = sorted(summary, key=grouper) + final_res = [] + for 
key, value in groupby(sorted_data, grouper): + breakdown_res = self._format_single_summary(list(value)) + final_res.append({**breakdown_res, "breakdown_value": key}) + return final_res + else: + res = self._format_single_summary(summary) + + return [res] + + def _format_single_summary(self, summary): + count = len(summary) + data = [] + days = [] + labels = [] + for row in summary: + timestamp: datetime = row["timestamp"] + data.append(row["conversion_rate"]) + hour_min_sec = " %H:%M:%S" if self._filter.interval == "hour" or self._filter.interval == "minute" else "" + days.append(timestamp.strftime(f"%Y-%m-%d{hour_min_sec}")) + labels.append(timestamp.strftime(HUMAN_READABLE_TIMESTAMP_FORMAT)) + return { + "count": count, + "data": data, + "days": days, + "labels": labels, + } + + def _is_period_final(self, timestamp: Union[datetime, date]): + # difference between current date and timestamp greater than window + now = datetime.utcnow().date() + intervals_to_subtract = cast(int, self._filter.funnel_window_interval) * -1 + interval_unit = ( + "day" if self._filter.funnel_window_interval_unit is None else self._filter.funnel_window_interval_unit + ) + delta = relativedelta(**{f"{interval_unit}s": intervals_to_subtract}) # type: ignore + completed_end = now + delta + compare_timestamp = timestamp.date() if isinstance(timestamp, datetime) else timestamp + is_final = compare_timestamp <= completed_end + return is_final diff --git a/ee/clickhouse/queries/funnels/funnel_trends_persons.py b/ee/clickhouse/queries/funnels/funnel_trends_persons.py new file mode 100644 index 0000000000000..03eecb09feadd --- /dev/null +++ b/ee/clickhouse/queries/funnels/funnel_trends_persons.py @@ -0,0 +1,48 @@ +from typing import cast + +from rest_framework.exceptions import ValidationError + +from ee.clickhouse.queries.funnels.funnel_trends import TIMESTAMP_FORMAT, ClickhouseFunnelTrends +from ee.clickhouse.queries.util import get_trunc_func_ch +from ee.clickhouse.sql.funnels.funnel import FUNNEL_PERSONS_BY_STEP_SQL +from posthog.constants import DROP_OFF, ENTRANCE_PERIOD_START +from posthog.models.person import Person + + +class ClickhouseFunnelTrendsPersons(ClickhouseFunnelTrends): + def get_query(self) -> str: + drop_off = self._filter.drop_off + if drop_off is None: + raise ValidationError(f"Filter parameter {DROP_OFF} must be provided and a bool for funnel trends persons!") + + entrance_period_start = self._filter.entrance_period_start + if not entrance_period_start: + raise ValidationError( + f"Filter parameter {ENTRANCE_PERIOD_START} must be provided and a datetime for funnel trends persons!" + ) + + step_counts_query = self.get_step_counts_without_aggregation_query( + specific_entrance_period_start=entrance_period_start + ) + # Expects multiple rows for same person, first event time, steps taken. 
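A worked example of the `_is_period_final` check defined above (the dates and the 14-day window are illustrative): a period is considered final once the full conversion window has elapsed since it started.

```python
from datetime import date
from dateutil.relativedelta import relativedelta

today = date(2021, 6, 20)                        # stand-in for datetime.utcnow().date()
completed_end = today + relativedelta(days=-14)  # 2021-06-06 with a 14-day window

assert date(2021, 6, 1) <= completed_end         # window elapsed -> period is final
assert not date(2021, 6, 10) <= completed_end    # still inside the window -> not final
```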
+ self.params.update(self.funnel_order.params) + + _, reached_to_step_count_condition, did_not_reach_to_step_count_condition = self.get_steps_reached_conditions() + + return FUNNEL_PERSONS_BY_STEP_SQL.format( + offset=self._filter.offset, + steps_per_person_query=step_counts_query, + persons_steps=did_not_reach_to_step_count_condition if drop_off else reached_to_step_count_condition, + extra_fields="", + limit="" if self._no_person_limit else "LIMIT %(limit)s", + ) + + def _summarize_data(self, results): + return results + + def _format_results(self, results): + people = Person.objects.filter(team_id=self._team.pk, uuid__in=[val[0] for val in results]) + + from posthog.api.person import PersonSerializer + + return PersonSerializer(people, many=True).data, len(results) > cast(int, self._filter.limit) - 1 diff --git a/ee/clickhouse/queries/funnels/funnel_unordered.py b/ee/clickhouse/queries/funnels/funnel_unordered.py new file mode 100644 index 0000000000000..8ff87ef3d07e4 --- /dev/null +++ b/ee/clickhouse/queries/funnels/funnel_unordered.py @@ -0,0 +1,160 @@ +from typing import List, cast + +from rest_framework.exceptions import ValidationError + +from ee.clickhouse.queries.funnels.base import ClickhouseFunnelBase + + +class ClickhouseFunnelUnordered(ClickhouseFunnelBase): + """ + Unordered Funnel is a funnel where the order of steps doesn't matter. + + ## Query Intuition + + Imagine a funnel with three events: A, B, and C. + This query splits the problem into two parts: + 1. Given the first event is A, find the furthest everyone went starting from A. + This finds any B's and C's that happen after A (without ordering them) + 2. Repeat the above, assuming first event to be B, and then C. + + Then, the outer query unions the result of (2) and takes the maximum of these. + + ## Results + + The result format is the same as the basic funnel, i.e. [step, count]. + Here, `step_i` (0 indexed) signifies the number of people that did at least `i+1` steps. + + ## Exclusion Semantics + For unordered funnels, exclusion is a bit weird. It means, given all ordering of the steps, + how far can you go without seeing an exclusion event. + If you see an exclusion event => you're discarded. + See test_advanced_funnel_multiple_exclusions_between_steps for details. 
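To make the rotation idea above concrete, here is a minimal pure-Python sketch (illustrative only; conversion window and exclusion handling are omitted) of "furthest step reached" for a single person:

```python
from typing import List, Sequence, Tuple

def furthest_step_reached(events: List[Tuple[str, int]], steps: Sequence[str]) -> int:
    """events: (event_name, timestamp) pairs for one person; steps: the funnel's step events."""
    best = 0
    for first in steps:  # rotate which step is treated as the entry point
        first_times = [ts for name, ts in events if name == first]
        if not first_times:
            continue
        start = min(first_times)
        # Count distinct other steps seen after the entry, in any order
        seen_after = {name for name, ts in events if name in steps and name != first and ts > start}
        best = max(best, 1 + len(seen_after))
    return best

assert furthest_step_reached([("A", 1), ("C", 2), ("B", 3)], ["A", "B", "C"]) == 3
assert furthest_step_reached([("C", 1), ("B", 2)], ["A", "B", "C"]) == 2  # any step can be the entry
```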
+ """ + + def get_query(self): + + max_steps = len(self._filter.entities) + + for exclusion in self._filter.exclusions: + if exclusion.funnel_from_step != 0 or exclusion.funnel_to_step != max_steps - 1: + raise ValidationError("Partial Exclusions not allowed in unordered funnels") + + breakdown_clause = self._get_breakdown_prop() + + return f""" + SELECT {self._get_count_columns(max_steps)} {self._get_step_time_avgs(max_steps)} {self._get_step_time_median(max_steps)} {breakdown_clause} FROM ( + {self.get_step_counts_query()} + ) {'GROUP BY prop' if breakdown_clause != '' else ''} SETTINGS allow_experimental_window_functions = 1 + """ + + def get_step_counts_query(self): + + max_steps = len(self._filter.entities) + + union_query = self.get_step_counts_without_aggregation_query() + breakdown_clause = self._get_breakdown_prop() + inner_timestamps, outer_timestamps = self._get_timestamp_selects() + + return f""" + SELECT aggregation_target, steps {self._get_step_time_avgs(max_steps, inner_query=True)} {self._get_step_time_median(max_steps, inner_query=True)} {breakdown_clause} {outer_timestamps} FROM ( + SELECT aggregation_target, steps, max(steps) over (PARTITION BY aggregation_target {breakdown_clause}) as max_steps {self._get_step_time_names(max_steps)} {breakdown_clause} {inner_timestamps} FROM ( + {union_query} + ) + ) GROUP BY aggregation_target, steps {breakdown_clause} + HAVING steps = max_steps + """ + + def get_step_counts_without_aggregation_query(self): + max_steps = len(self._filter.entities) + union_queries = [] + entities_to_use = list(self._filter.entities) + + partition_select = self._get_partition_cols(1, max_steps) + sorting_condition = self.get_sorting_condition(max_steps) + breakdown_clause = self._get_breakdown_prop(group_remaining=True) + exclusion_clause = self._get_exclusion_condition() + + for i in range(max_steps): + inner_query = f""" + SELECT + aggregation_target, + timestamp, + {partition_select} + {breakdown_clause} + FROM ({self._get_inner_event_query(entities_to_use, f"events_{i}")}) + """ + + formatted_query = f""" + SELECT *, {sorting_condition} AS steps {exclusion_clause} {self._get_step_times(max_steps)} FROM ( + {inner_query} + ) WHERE step_0 = 1 + {'AND exclusion = 0' if exclusion_clause else ''} + """ + + #  rotate entities by 1 to get new first event + entities_to_use.append(entities_to_use.pop(0)) + union_queries.append(formatted_query) + + return " UNION ALL ".join(union_queries) + + def _get_step_times(self, max_steps: int): + conditions: List[str] = [] + + conversion_times_elements = [] + for i in range(max_steps): + conversion_times_elements.append(f"latest_{i}") + + conditions.append(f"arraySort([{','.join(conversion_times_elements)}]) as conversion_times") + + for i in range(1, max_steps): + conditions.append( + f"if(isNotNull(conversion_times[{i+1}]) AND conversion_times[{i+1}] <= conversion_times[{i}] + INTERVAL {self._filter.funnel_window_interval} {self._filter.funnel_window_interval_unit_ch()}, " + f"dateDiff('second', conversion_times[{i}], conversion_times[{i+1}]), NULL) step_{i}_conversion_time" + ) + # array indices in ClickHouse are 1-based :shrug: + + formatted = ", ".join(conditions) + return f", {formatted}" if formatted else "" + + def get_sorting_condition(self, max_steps: int): + + conditions = [] + + event_times_elements = [] + for i in range(max_steps): + event_times_elements.append(f"latest_{i}") + + conditions.append(f"arraySort([{','.join(event_times_elements)}]) as event_times") + # replacement of latest_i for whatever query 
part requires it, just like conversion_times + basic_conditions: List[str] = [] + for i in range(1, max_steps): + basic_conditions.append( + f"if(latest_0 < latest_{i} AND latest_{i} <= latest_0 + INTERVAL {self._filter.funnel_window_interval} {self._filter.funnel_window_interval_unit_ch()}, 1, 0)" + ) + + conditions.append(f"arraySum([{','.join(basic_conditions)}, 1])") + + if basic_conditions: + return ",".join(conditions) + else: + return "1" + + def _get_exclusion_condition(self): + if not self._filter.exclusions: + return "" + + conditions = [] + for exclusion_id, exclusion in enumerate(self._filter.exclusions): + from_time = f"latest_{exclusion.funnel_from_step}" + to_time = f"event_times[{cast(int, exclusion.funnel_to_step) + 1}]" + exclusion_time = f"exclusion_{exclusion_id}_latest_{exclusion.funnel_from_step}" + condition = ( + f"if( {exclusion_time} > {from_time} AND {exclusion_time} < " + f"if(isNull({to_time}), {from_time} + INTERVAL {self._filter.funnel_window_interval} {self._filter.funnel_window_interval_unit_ch()}, {to_time}), 1, 0)" + ) + conditions.append(condition) + + if conditions: + return f", arraySum([{','.join(conditions)}]) as exclusion" + else: + return "" diff --git a/ee/clickhouse/queries/funnels/funnel_unordered_persons.py b/ee/clickhouse/queries/funnels/funnel_unordered_persons.py new file mode 100644 index 0000000000000..74fbcbba23bd9 --- /dev/null +++ b/ee/clickhouse/queries/funnels/funnel_unordered_persons.py @@ -0,0 +1,23 @@ +from typing import cast + +from ee.clickhouse.queries.funnels.funnel_unordered import ClickhouseFunnelUnordered +from ee.clickhouse.sql.funnels.funnel import FUNNEL_PERSONS_BY_STEP_SQL +from posthog.models import Person + + +class ClickhouseFunnelUnorderedPersons(ClickhouseFunnelUnordered): + def get_query(self): + return FUNNEL_PERSONS_BY_STEP_SQL.format( + offset=self._filter.offset, + steps_per_person_query=self.get_step_counts_query(), + persons_steps=self._get_funnel_person_step_condition(), + extra_fields=self._get_timestamp_outer_select(), + limit="" if self._no_person_limit else "LIMIT %(limit)s", + ) + + def _format_results(self, results): + people = Person.objects.filter(team_id=self._team.pk, uuid__in=[val[0] for val in results]) + + from posthog.api.person import PersonSerializer + + return PersonSerializer(people, many=True).data, len(results) > cast(int, self._filter.limit) - 1 diff --git a/ee/clickhouse/queries/funnels/test/__init__.py b/ee/clickhouse/queries/funnels/test/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/ee/clickhouse/queries/funnels/test/__snapshots__/test_funnel.ambr b/ee/clickhouse/queries/funnels/test/__snapshots__/test_funnel.ambr new file mode 100644 index 0000000000000..488af9f12e6de --- /dev/null +++ b/ee/clickhouse/queries/funnels/test/__snapshots__/test_funnel.ambr @@ -0,0 +1,1262 @@ +# name: TestClickhouseFunnel.test_funnel_aggregation_with_groups + ' + + SELECT countIf(steps = 1) step_1, + countIf(steps = 2) step_2, + groupArrayIf(100)(DISTINCT aggregation_target, + steps = 1) step_people_1, + groupArrayIf(100)(DISTINCT aggregation_target, + steps = 2) step_people_2, + avg(step_1_average_conversion_time_inner) step_1_average_conversion_time, + median(step_1_median_conversion_time_inner) step_1_median_conversion_time + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner + FROM + (SELECT aggregation_target, + steps, + max(steps) over 
(PARTITION BY aggregation_target) as max_steps, + step_1_conversion_time + FROM + (SELECT *, + if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 14 DAY, 2, 1) AS steps , + if(isNotNull(latest_1) + AND latest_1 <= latest_0 + INTERVAL 14 DAY, dateDiff('second', toDateTime(latest_0), toDateTime(latest_1)), NULL) step_1_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1 + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'user signed up', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'paid', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1 + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + e.$group_0 as aggregation_target, + e.$group_0 as $group_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + WHERE team_id = 2 + AND event IN ['paid', 'user signed up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-14 23:59:59' + AND NOT has([''], $group_0) + AND team_id = 2 ) events + WHERE (step_0 = 1 + OR step_1 = 1) )) + WHERE step_0 = 1 SETTINGS allow_experimental_window_functions = 1 )) + GROUP BY aggregation_target, + steps + HAVING steps = max_steps SETTINGS allow_experimental_window_functions = 1) SETTINGS allow_experimental_window_functions = 1 + ' +--- +# name: TestClickhouseFunnel.test_funnel_group_aggregation_with_groups_entity_filtering + ' + + SELECT countIf(steps = 1) step_1, + countIf(steps = 2) step_2, + groupArrayIf(100)(DISTINCT aggregation_target, + steps = 1) step_people_1, + groupArrayIf(100)(DISTINCT aggregation_target, + steps = 2) step_people_2, + avg(step_1_average_conversion_time_inner) step_1_average_conversion_time, + median(step_1_median_conversion_time_inner) step_1_median_conversion_time + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target) as max_steps, + step_1_conversion_time + FROM + (SELECT *, + if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 14 DAY, 2, 1) AS steps , + if(isNotNull(latest_1) + AND latest_1 <= latest_0 + INTERVAL 14 DAY, dateDiff('second', toDateTime(latest_0), toDateTime(latest_1)), NULL) step_1_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1 + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'user signed up' + AND has(['org:5'], $group_0) + AND team_id = 2, 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'paid', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1 + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + e.$group_0 as aggregation_target, + e.$group_0 as 
$group_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + WHERE team_id = 2 + AND event IN ['paid', 'user signed up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-14 23:59:59' + AND NOT has([''], $group_0) + AND team_id = 2 ) events + WHERE (step_0 = 1 + OR step_1 = 1) )) + WHERE step_0 = 1 SETTINGS allow_experimental_window_functions = 1 )) + GROUP BY aggregation_target, + steps + HAVING steps = max_steps SETTINGS allow_experimental_window_functions = 1) SETTINGS allow_experimental_window_functions = 1 + ' +--- +# name: TestClickhouseFunnel.test_funnel_with_groups_entity_filtering + ' + + SELECT countIf(steps = 1) step_1, + countIf(steps = 2) step_2, + groupArrayIf(100)(DISTINCT aggregation_target, + steps = 1) step_people_1, + groupArrayIf(100)(DISTINCT aggregation_target, + steps = 2) step_people_2, + avg(step_1_average_conversion_time_inner) step_1_average_conversion_time, + median(step_1_median_conversion_time_inner) step_1_median_conversion_time + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target) as max_steps, + step_1_conversion_time + FROM + (SELECT *, + if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 14 DAY, 2, 1) AS steps , + if(isNotNull(latest_1) + AND latest_1 <= latest_0 + INTERVAL 14 DAY, dateDiff('second', toDateTime(latest_0), toDateTime(latest_1)), NULL) step_1_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1 + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'user signed up' + AND has(['org:5'], $group_0) + AND team_id = 2, 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'paid', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1 + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + e.$group_0 as $group_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + WHERE team_id = 2 + AND event IN ['paid', 'user signed up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-14 23:59:59' ) events + WHERE (step_0 = 1 + OR step_1 = 1) )) + WHERE step_0 = 1 SETTINGS allow_experimental_window_functions = 1 )) + GROUP BY aggregation_target, + steps + HAVING steps = max_steps SETTINGS allow_experimental_window_functions = 1) SETTINGS allow_experimental_window_functions = 1 + ' +--- +# name: TestClickhouseFunnel.test_funnel_with_groups_global_filtering + ' + + SELECT countIf(steps = 1) step_1, + countIf(steps = 2) step_2, + 
groupArrayIf(100)(DISTINCT aggregation_target, + steps = 1) step_people_1, + groupArrayIf(100)(DISTINCT aggregation_target, + steps = 2) step_people_2, + avg(step_1_average_conversion_time_inner) step_1_average_conversion_time, + median(step_1_median_conversion_time_inner) step_1_median_conversion_time + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target) as max_steps, + step_1_conversion_time + FROM + (SELECT *, + if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 14 DAY, 2, 1) AS steps , + if(isNotNull(latest_1) + AND latest_1 <= latest_0 + INTERVAL 14 DAY, dateDiff('second', toDateTime(latest_0), toDateTime(latest_1)), NULL) step_1_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1 + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'user signed up', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'paid', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1 + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['paid', 'user signed up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-14 23:59:59' + AND has(['finance'], trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry'))) ) events + WHERE (step_0 = 1 + OR step_1 = 1) )) + WHERE step_0 = 1 SETTINGS allow_experimental_window_functions = 1 )) + GROUP BY aggregation_target, + steps + HAVING steps = max_steps SETTINGS allow_experimental_window_functions = 1) SETTINGS allow_experimental_window_functions = 1 + ' +--- +# name: TestFunnelBreakdown.test_funnel_aggregate_by_groups_breakdown_group + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + AND NOT has([''], $group_0) + AND e.team_id = 2 + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelBreakdown.test_funnel_aggregate_by_groups_breakdown_group.1 + ' + + SELECT countIf(steps = 1) step_1, + countIf(steps = 2) step_2, + countIf(steps = 3) 
step_3, + groupArrayIf(100)(DISTINCT aggregation_target, + steps = 1) step_people_1, + groupArrayIf(100)(DISTINCT aggregation_target, + steps = 2) step_people_2, + groupArrayIf(100)(DISTINCT aggregation_target, + steps = 3) step_people_3, + avg(step_1_average_conversion_time_inner) step_1_average_conversion_time, + avg(step_2_average_conversion_time_inner) step_2_average_conversion_time, + median(step_1_median_conversion_time_inner) step_1_median_conversion_time, + median(step_2_median_conversion_time_inner) step_2_median_conversion_time, + prop + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + avg(step_2_conversion_time) step_2_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner, + median(step_2_conversion_time) step_2_median_conversion_time_inner, + prop + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target, + prop) as max_steps, + step_1_conversion_time, + step_2_conversion_time, + prop + FROM + (SELECT *, + if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY + AND latest_1 < latest_2 + AND latest_2 <= latest_0 + INTERVAL 7 DAY, 3, if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY, 2, 1)) AS steps , + if(isNotNull(latest_1) + AND latest_1 <= latest_0 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_0), toDateTime(latest_1)), NULL) step_1_conversion_time, + if(isNotNull(latest_2) + AND latest_2 <= latest_1 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_1), toDateTime(latest_2)), NULL) step_2_conversion_time, + prop + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + prop + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + latest_1, + step_2, + if(latest_2 < latest_1, NULL, latest_2) as latest_2 , + prop + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['finance', 'technology'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'sign up', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'play movie', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'buy', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + e.$group_0 as aggregation_target, + e.$group_0 as $group_0, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + 
argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + AND NOT has([''], $group_0) + AND team_id = 2 ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )))) + WHERE step_0 = 1 SETTINGS allow_experimental_window_functions = 1 )) + GROUP BY aggregation_target, + steps, + prop + HAVING steps = max_steps SETTINGS allow_experimental_window_functions = 1) + GROUP BY prop SETTINGS allow_experimental_window_functions = 1 + ' +--- +# name: TestFunnelBreakdown.test_funnel_breakdown_group + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelBreakdown.test_funnel_breakdown_group.1 + ' + + SELECT countIf(steps = 1) step_1, + countIf(steps = 2) step_2, + countIf(steps = 3) step_3, + groupArrayIf(100)(DISTINCT aggregation_target, + steps = 1) step_people_1, + groupArrayIf(100)(DISTINCT aggregation_target, + steps = 2) step_people_2, + groupArrayIf(100)(DISTINCT aggregation_target, + steps = 3) step_people_3, + avg(step_1_average_conversion_time_inner) step_1_average_conversion_time, + avg(step_2_average_conversion_time_inner) step_2_average_conversion_time, + median(step_1_median_conversion_time_inner) step_1_median_conversion_time, + median(step_2_median_conversion_time_inner) step_2_median_conversion_time, + prop + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + avg(step_2_conversion_time) step_2_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner, + median(step_2_conversion_time) step_2_median_conversion_time_inner, + prop + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target, + prop) as max_steps, + step_1_conversion_time, + step_2_conversion_time, + prop + FROM + (SELECT *, + if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY + AND latest_1 < latest_2 + AND latest_2 <= latest_0 + INTERVAL 7 DAY, 3, if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY, 2, 1)) AS steps , + if(isNotNull(latest_1) + AND latest_1 <= latest_0 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_0), toDateTime(latest_1)), NULL) step_1_conversion_time, + if(isNotNull(latest_2) + AND latest_2 <= latest_1 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_1), toDateTime(latest_2)), NULL) step_2_conversion_time, + prop + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + prop + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + latest_1, + step_2, + if(latest_2 < latest_1, NULL, 
latest_2) as latest_2 , + prop + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'sign up', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'play movie', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'buy', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )))) + WHERE step_0 = 1 SETTINGS allow_experimental_window_functions = 1 )) + GROUP BY aggregation_target, + steps, + prop + HAVING steps = max_steps SETTINGS allow_experimental_window_functions = 1) + GROUP BY prop SETTINGS allow_experimental_window_functions = 1 + ' +--- +# name: TestFunnelBreakdown.test_funnel_breakdown_group.2 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelBreakdown.test_funnel_breakdown_group.3 + ' + + SELECT aggregation_target as person_id + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + avg(step_2_conversion_time) step_2_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner, + median(step_2_conversion_time) step_2_median_conversion_time_inner, + prop + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target, + prop) as max_steps, + step_1_conversion_time, + step_2_conversion_time, + prop + FROM + (SELECT *, + if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY + AND latest_1 < latest_2 + AND latest_2 <= 
latest_0 + INTERVAL 7 DAY, 3, if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY, 2, 1)) AS steps , + if(isNotNull(latest_1) + AND latest_1 <= latest_0 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_0), toDateTime(latest_1)), NULL) step_1_conversion_time, + if(isNotNull(latest_2) + AND latest_2 <= latest_1 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_1), toDateTime(latest_2)), NULL) step_2_conversion_time, + prop + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + prop + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + latest_1, + step_2, + if(latest_2 < latest_1, NULL, latest_2) as latest_2 , + prop + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'sign up', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'play movie', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'buy', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )))) + WHERE step_0 = 1 SETTINGS allow_experimental_window_functions = 1 )) + GROUP BY aggregation_target, + steps, + prop + HAVING steps = max_steps SETTINGS allow_experimental_window_functions = 1) + WHERE steps IN [1, 2, 3] + AND prop IN ['finance'] + ORDER BY aggregation_target + LIMIT 100 + OFFSET 0 SETTINGS allow_experimental_window_functions = 1 + ' +--- +# name: TestFunnelBreakdown.test_funnel_breakdown_group.4 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE 
team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelBreakdown.test_funnel_breakdown_group.5 + ' + + SELECT aggregation_target as person_id + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + avg(step_2_conversion_time) step_2_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner, + median(step_2_conversion_time) step_2_median_conversion_time_inner, + prop + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target, + prop) as max_steps, + step_1_conversion_time, + step_2_conversion_time, + prop + FROM + (SELECT *, + if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY + AND latest_1 < latest_2 + AND latest_2 <= latest_0 + INTERVAL 7 DAY, 3, if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY, 2, 1)) AS steps , + if(isNotNull(latest_1) + AND latest_1 <= latest_0 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_0), toDateTime(latest_1)), NULL) step_1_conversion_time, + if(isNotNull(latest_2) + AND latest_2 <= latest_1 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_1), toDateTime(latest_2)), NULL) step_2_conversion_time, + prop + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + prop + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + latest_1, + step_2, + if(latest_2 < latest_1, NULL, latest_2) as latest_2 , + prop + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'sign up', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'play movie', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'buy', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= 
'2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )))) + WHERE step_0 = 1 SETTINGS allow_experimental_window_functions = 1 )) + GROUP BY aggregation_target, + steps, + prop + HAVING steps = max_steps SETTINGS allow_experimental_window_functions = 1) + WHERE steps IN [2, 3] + AND prop IN ['finance'] + ORDER BY aggregation_target + LIMIT 100 + OFFSET 0 SETTINGS allow_experimental_window_functions = 1 + ' +--- +# name: TestFunnelBreakdown.test_funnel_breakdown_group.6 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelBreakdown.test_funnel_breakdown_group.7 + ' + + SELECT aggregation_target as person_id + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + avg(step_2_conversion_time) step_2_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner, + median(step_2_conversion_time) step_2_median_conversion_time_inner, + prop + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target, + prop) as max_steps, + step_1_conversion_time, + step_2_conversion_time, + prop + FROM + (SELECT *, + if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY + AND latest_1 < latest_2 + AND latest_2 <= latest_0 + INTERVAL 7 DAY, 3, if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY, 2, 1)) AS steps , + if(isNotNull(latest_1) + AND latest_1 <= latest_0 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_0), toDateTime(latest_1)), NULL) step_1_conversion_time, + if(isNotNull(latest_2) + AND latest_2 <= latest_1 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_1), toDateTime(latest_2)), NULL) step_2_conversion_time, + prop + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + prop + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + latest_1, + step_2, + if(latest_2 < latest_1, NULL, latest_2) as latest_2 , + prop + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'sign up', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'play movie', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'buy', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM 
JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )))) + WHERE step_0 = 1 SETTINGS allow_experimental_window_functions = 1 )) + GROUP BY aggregation_target, + steps, + prop + HAVING steps = max_steps SETTINGS allow_experimental_window_functions = 1) + WHERE steps IN [1, 2, 3] + AND prop IN ['technology'] + ORDER BY aggregation_target + LIMIT 100 + OFFSET 0 SETTINGS allow_experimental_window_functions = 1 + ' +--- +# name: TestFunnelBreakdown.test_funnel_breakdown_group.8 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelBreakdown.test_funnel_breakdown_group.9 + ' + + SELECT aggregation_target as person_id + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + avg(step_2_conversion_time) step_2_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner, + median(step_2_conversion_time) step_2_median_conversion_time_inner, + prop + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target, + prop) as max_steps, + step_1_conversion_time, + step_2_conversion_time, + prop + FROM + (SELECT *, + if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY + AND latest_1 < latest_2 + AND latest_2 <= latest_0 + INTERVAL 7 DAY, 3, if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY, 2, 1)) AS steps , + if(isNotNull(latest_1) + AND latest_1 <= latest_0 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_0), toDateTime(latest_1)), NULL) step_1_conversion_time, + if(isNotNull(latest_2) + AND latest_2 <= latest_1 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_1), toDateTime(latest_2)), NULL) step_2_conversion_time, + prop + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + prop + FROM + (SELECT 
aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + latest_1, + step_2, + if(latest_2 < latest_1, NULL, latest_2) as latest_2 , + prop + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'sign up', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'play movie', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'buy', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )))) + WHERE step_0 = 1 SETTINGS allow_experimental_window_functions = 1 )) + GROUP BY aggregation_target, + steps, + prop + HAVING steps = max_steps SETTINGS allow_experimental_window_functions = 1) + WHERE steps IN [2, 3] + AND prop IN ['technology'] + ORDER BY aggregation_target + LIMIT 100 + OFFSET 0 SETTINGS allow_experimental_window_functions = 1 + ' +--- diff --git a/ee/clickhouse/queries/funnels/test/__snapshots__/test_funnel_strict.ambr b/ee/clickhouse/queries/funnels/test/__snapshots__/test_funnel_strict.ambr new file mode 100644 index 0000000000000..3cdcd10d1405a --- /dev/null +++ b/ee/clickhouse/queries/funnels/test/__snapshots__/test_funnel_strict.ambr @@ -0,0 +1,773 @@ +# name: TestFunnelStrictStepsBreakdown.test_funnel_aggregate_by_groups_breakdown_group + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + AND NOT has([''], $group_0) + AND e.team_id = 2 + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelStrictStepsBreakdown.test_funnel_aggregate_by_groups_breakdown_group.1 + ' + + SELECT countIf(steps = 1) step_1, + 
countIf(steps = 2) step_2, + countIf(steps = 3) step_3, + avg(step_1_average_conversion_time_inner) step_1_average_conversion_time, + avg(step_2_average_conversion_time_inner) step_2_average_conversion_time, + median(step_1_median_conversion_time_inner) step_1_median_conversion_time, + median(step_2_median_conversion_time_inner) step_2_median_conversion_time, + prop + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + avg(step_2_conversion_time) step_2_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner, + median(step_2_conversion_time) step_2_median_conversion_time_inner, + prop + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target, + prop) as max_steps, + step_1_conversion_time, + step_2_conversion_time, + prop + FROM + (SELECT *, + if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY + AND latest_1 < latest_2 + AND latest_2 <= latest_0 + INTERVAL 7 DAY, 3, if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY, 2, 1)) AS steps, + if(isNotNull(latest_1) + AND latest_1 <= latest_0 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_0), toDateTime(latest_1)), NULL) step_1_conversion_time, + if(isNotNull(latest_2) + AND latest_2 <= latest_1 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_1), toDateTime(latest_2)), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN 2 PRECEDING AND 2 PRECEDING) latest_2 , + if(has(['finance', 'technology'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'sign up', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'play movie', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'buy', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + e.$group_0 as aggregation_target, + e.$group_0 as $group_0, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + AND NOT has([''], $group_0) + AND team_id = 2 ) events + WHERE (1=1) )) + WHERE step_0 = 1 )) + GROUP BY aggregation_target, + steps, + prop + HAVING steps = max_steps) + GROUP BY prop SETTINGS allow_experimental_window_functions = 1 + ' +--- +# name: TestFunnelStrictStepsBreakdown.test_funnel_breakdown_group + ' + + SELECT groupArray(value) + 
FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelStrictStepsBreakdown.test_funnel_breakdown_group.1 + ' + + SELECT countIf(steps = 1) step_1, + countIf(steps = 2) step_2, + countIf(steps = 3) step_3, + avg(step_1_average_conversion_time_inner) step_1_average_conversion_time, + avg(step_2_average_conversion_time_inner) step_2_average_conversion_time, + median(step_1_median_conversion_time_inner) step_1_median_conversion_time, + median(step_2_median_conversion_time_inner) step_2_median_conversion_time, + prop + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + avg(step_2_conversion_time) step_2_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner, + median(step_2_conversion_time) step_2_median_conversion_time_inner, + prop + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target, + prop) as max_steps, + step_1_conversion_time, + step_2_conversion_time, + prop + FROM + (SELECT *, + if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY + AND latest_1 < latest_2 + AND latest_2 <= latest_0 + INTERVAL 7 DAY, 3, if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY, 2, 1)) AS steps, + if(isNotNull(latest_1) + AND latest_1 <= latest_0 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_0), toDateTime(latest_1)), NULL) step_1_conversion_time, + if(isNotNull(latest_2) + AND latest_2 <= latest_1 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_1), toDateTime(latest_2)), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN 2 PRECEDING AND 2 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'sign up', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'play movie', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'buy', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, 
+ argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (1=1) )) + WHERE step_0 = 1 )) + GROUP BY aggregation_target, + steps, + prop + HAVING steps = max_steps) + GROUP BY prop SETTINGS allow_experimental_window_functions = 1 + ' +--- +# name: TestFunnelStrictStepsBreakdown.test_funnel_breakdown_group.2 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelStrictStepsBreakdown.test_funnel_breakdown_group.3 + ' + + SELECT aggregation_target as person_id + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + avg(step_2_conversion_time) step_2_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner, + median(step_2_conversion_time) step_2_median_conversion_time_inner, + prop + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target, + prop) as max_steps, + step_1_conversion_time, + step_2_conversion_time, + prop + FROM + (SELECT *, + if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY + AND latest_1 < latest_2 + AND latest_2 <= latest_0 + INTERVAL 7 DAY, 3, if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY, 2, 1)) AS steps, + if(isNotNull(latest_1) + AND latest_1 <= latest_0 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_0), toDateTime(latest_1)), NULL) step_1_conversion_time, + if(isNotNull(latest_2) + AND latest_2 <= latest_1 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_1), toDateTime(latest_2)), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN 2 PRECEDING AND 2 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'sign up', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'play movie', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'buy', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + 
FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (1=1) )) + WHERE step_0 = 1 )) + GROUP BY aggregation_target, + steps, + prop + HAVING steps = max_steps) + WHERE steps IN [1, 2, 3] + AND prop IN ['finance'] + ORDER BY aggregation_target + LIMIT 100 + OFFSET 0 SETTINGS allow_experimental_window_functions = 1 + ' +--- +# name: TestFunnelStrictStepsBreakdown.test_funnel_breakdown_group.4 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelStrictStepsBreakdown.test_funnel_breakdown_group.5 + ' + + SELECT aggregation_target as person_id + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + avg(step_2_conversion_time) step_2_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner, + median(step_2_conversion_time) step_2_median_conversion_time_inner, + prop + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target, + prop) as max_steps, + step_1_conversion_time, + step_2_conversion_time, + prop + FROM + (SELECT *, + if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY + AND latest_1 < latest_2 + AND latest_2 <= latest_0 + INTERVAL 7 DAY, 3, if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY, 2, 1)) AS steps, + if(isNotNull(latest_1) + AND latest_1 <= latest_0 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_0), toDateTime(latest_1)), NULL) step_1_conversion_time, + if(isNotNull(latest_2) + AND latest_2 <= latest_1 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_1), toDateTime(latest_2)), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN 2 PRECEDING AND 2 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'sign up', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'play movie', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'buy', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + 
e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (1=1) )) + WHERE step_0 = 1 )) + GROUP BY aggregation_target, + steps, + prop + HAVING steps = max_steps) + WHERE steps IN [2, 3] + AND prop IN ['finance'] + ORDER BY aggregation_target + LIMIT 100 + OFFSET 0 SETTINGS allow_experimental_window_functions = 1 + ' +--- +# name: TestFunnelStrictStepsBreakdown.test_funnel_breakdown_group.6 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelStrictStepsBreakdown.test_funnel_breakdown_group.7 + ' + + SELECT aggregation_target as person_id + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + avg(step_2_conversion_time) step_2_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner, + median(step_2_conversion_time) step_2_median_conversion_time_inner, + prop + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target, + prop) as max_steps, + step_1_conversion_time, + step_2_conversion_time, + prop + FROM + (SELECT *, + if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY + AND latest_1 < latest_2 + AND latest_2 <= latest_0 + INTERVAL 7 DAY, 3, if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY, 2, 1)) AS steps, + if(isNotNull(latest_1) + AND latest_1 <= latest_0 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_0), toDateTime(latest_1)), NULL) step_1_conversion_time, + if(isNotNull(latest_2) + AND latest_2 <= latest_1 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_1), toDateTime(latest_2)), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN 2 PRECEDING AND 2 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'sign up', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + 
if(event = 'play movie', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'buy', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (1=1) )) + WHERE step_0 = 1 )) + GROUP BY aggregation_target, + steps, + prop + HAVING steps = max_steps) + WHERE steps IN [1, 2, 3] + AND prop IN ['technology'] + ORDER BY aggregation_target + LIMIT 100 + OFFSET 0 SETTINGS allow_experimental_window_functions = 1 + ' +--- +# name: TestFunnelStrictStepsBreakdown.test_funnel_breakdown_group.8 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelStrictStepsBreakdown.test_funnel_breakdown_group.9 + ' + + SELECT aggregation_target as person_id + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + avg(step_2_conversion_time) step_2_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner, + median(step_2_conversion_time) step_2_median_conversion_time_inner, + prop + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target, + prop) as max_steps, + step_1_conversion_time, + step_2_conversion_time, + prop + FROM + (SELECT *, + if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY + AND latest_1 < latest_2 + AND latest_2 <= latest_0 + INTERVAL 7 DAY, 3, if(latest_0 < latest_1 + AND latest_1 <= latest_0 + INTERVAL 7 DAY, 2, 1)) AS steps, + if(isNotNull(latest_1) + AND latest_1 <= latest_0 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_0), toDateTime(latest_1)), NULL) step_1_conversion_time, + if(isNotNull(latest_2) + AND latest_2 <= latest_1 + INTERVAL 7 DAY, dateDiff('second', toDateTime(latest_1), toDateTime(latest_2)), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) latest_1, + step_2, + min(latest_2) over 
(PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN 2 PRECEDING AND 2 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'sign up', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'play movie', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'buy', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (1=1) )) + WHERE step_0 = 1 )) + GROUP BY aggregation_target, + steps, + prop + HAVING steps = max_steps) + WHERE steps IN [2, 3] + AND prop IN ['technology'] + ORDER BY aggregation_target + LIMIT 100 + OFFSET 0 SETTINGS allow_experimental_window_functions = 1 + ' +--- diff --git a/ee/clickhouse/queries/funnels/test/__snapshots__/test_funnel_unordered.ambr b/ee/clickhouse/queries/funnels/test/__snapshots__/test_funnel_unordered.ambr new file mode 100644 index 0000000000000..9802ad1372551 --- /dev/null +++ b/ee/clickhouse/queries/funnels/test/__snapshots__/test_funnel_unordered.ambr @@ -0,0 +1,2068 @@ +# name: TestFunnelUnorderedSteps.test_unordered_funnel_with_groups + ' + + SELECT countIf(steps = 1) step_1, + countIf(steps = 2) step_2, + avg(step_1_average_conversion_time_inner) step_1_average_conversion_time, + median(step_1_median_conversion_time_inner) step_1_median_conversion_time + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target) as max_steps, + step_1_conversion_time + FROM + (SELECT *, + arraySort([latest_0,latest_1]) as event_times, + arraySum([if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 14 DAY, 1, 0), 1]) AS steps , + arraySort([latest_0,latest_1]) as conversion_times, + if(isNotNull(conversion_times[2]) + AND conversion_times[2] <= conversion_times[1] + INTERVAL 14 DAY, dateDiff('second', conversion_times[1], conversion_times[2]), NULL) step_1_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1 + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'user signed up', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'paid', 1, 0) as step_1, + if(step_1 = 1, 
timestamp, null) as latest_1 + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + e.$group_0 as aggregation_target, + e.$group_0 as $group_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + WHERE team_id = 2 + AND event IN ['paid', 'user signed up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-14 23:59:59' + AND NOT has([''], $group_0) + AND team_id = 2 ) events + WHERE (step_0 = 1 + OR step_1 = 1) )) + WHERE step_0 = 1 + UNION ALL SELECT *, + arraySort([latest_0,latest_1]) as event_times, + arraySum([if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 14 DAY, 1, 0), 1]) AS steps , + arraySort([latest_0,latest_1]) as conversion_times, + if(isNotNull(conversion_times[2]) + AND conversion_times[2] <= conversion_times[1] + INTERVAL 14 DAY, dateDiff('second', conversion_times[1], conversion_times[2]), NULL) step_1_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1 + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'paid', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'user signed up', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1 + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + e.$group_0 as aggregation_target, + e.$group_0 as $group_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + WHERE team_id = 2 + AND event IN ['paid', 'user signed up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-14 23:59:59' + AND NOT has([''], $group_0) + AND team_id = 2 ) events + WHERE (step_0 = 1 + OR step_1 = 1) )) + WHERE step_0 = 1 )) + GROUP BY aggregation_target, + steps + HAVING steps = max_steps) SETTINGS allow_experimental_window_functions = 1 + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_aggregate_by_groups_breakdown_group + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + AND NOT has([''], $group_0) + AND e.team_id = 2 + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_aggregate_by_groups_breakdown_group.1 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM 
JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + AND NOT has([''], $group_0) + AND e.team_id = 2 + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_aggregate_by_groups_breakdown_group.2 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + AND NOT has([''], $group_0) + AND e.team_id = 2 + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_aggregate_by_groups_breakdown_group.3 + ' + + SELECT countIf(steps = 1) step_1, + countIf(steps = 2) step_2, + countIf(steps = 3) step_3, + avg(step_1_average_conversion_time_inner) step_1_average_conversion_time, + avg(step_2_average_conversion_time_inner) step_2_average_conversion_time, + median(step_1_median_conversion_time_inner) step_1_median_conversion_time, + median(step_2_median_conversion_time_inner) step_2_median_conversion_time, + prop + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + avg(step_2_conversion_time) step_2_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner, + median(step_2_conversion_time) step_2_median_conversion_time_inner, + prop + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target, + prop) as max_steps, + step_1_conversion_time, + step_2_conversion_time, + prop + FROM + (SELECT *, + arraySort([latest_0,latest_1,latest_2]) as event_times, + arraySum([if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 7 DAY, 1, 0),if(latest_0 < latest_2 AND latest_2 <= latest_0 + INTERVAL 7 DAY, 1, 0), 1]) AS steps , + arraySort([latest_0,latest_1,latest_2]) as conversion_times, + if(isNotNull(conversion_times[2]) + AND conversion_times[2] <= conversion_times[1] + INTERVAL 7 DAY, dateDiff('second', conversion_times[1], conversion_times[2]), NULL) step_1_conversion_time, + if(isNotNull(conversion_times[3]) + AND conversion_times[3] <= conversion_times[2] + INTERVAL 7 DAY, dateDiff('second', conversion_times[2], conversion_times[3]), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['finance', 'technology'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'sign up', 1, 0) as step_0, + 
if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'play movie', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'buy', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + e.$group_0 as aggregation_target, + e.$group_0 as $group_0, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + AND NOT has([''], $group_0) + AND team_id = 2 ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )) + WHERE step_0 = 1 + UNION ALL SELECT *, + arraySort([latest_0,latest_1,latest_2]) as event_times, + arraySum([if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 7 DAY, 1, 0),if(latest_0 < latest_2 AND latest_2 <= latest_0 + INTERVAL 7 DAY, 1, 0), 1]) AS steps , + arraySort([latest_0,latest_1,latest_2]) as conversion_times, + if(isNotNull(conversion_times[2]) + AND conversion_times[2] <= conversion_times[1] + INTERVAL 7 DAY, dateDiff('second', conversion_times[1], conversion_times[2]), NULL) step_1_conversion_time, + if(isNotNull(conversion_times[3]) + AND conversion_times[3] <= conversion_times[2] + INTERVAL 7 DAY, dateDiff('second', conversion_times[2], conversion_times[3]), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['finance', 'technology'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'play movie', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'buy', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'sign up', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + e.$group_0 as aggregation_target, + e.$group_0 as $group_0, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + 
INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + AND NOT has([''], $group_0) + AND team_id = 2 ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )) + WHERE step_0 = 1 + UNION ALL SELECT *, + arraySort([latest_0,latest_1,latest_2]) as event_times, + arraySum([if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 7 DAY, 1, 0),if(latest_0 < latest_2 AND latest_2 <= latest_0 + INTERVAL 7 DAY, 1, 0), 1]) AS steps , + arraySort([latest_0,latest_1,latest_2]) as conversion_times, + if(isNotNull(conversion_times[2]) + AND conversion_times[2] <= conversion_times[1] + INTERVAL 7 DAY, dateDiff('second', conversion_times[1], conversion_times[2]), NULL) step_1_conversion_time, + if(isNotNull(conversion_times[3]) + AND conversion_times[3] <= conversion_times[2] + INTERVAL 7 DAY, dateDiff('second', conversion_times[2], conversion_times[3]), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['finance', 'technology'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'buy', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'sign up', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'play movie', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + e.$group_0 as aggregation_target, + e.$group_0 as $group_0, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + AND NOT has([''], $group_0) + AND team_id = 2 ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )) + WHERE step_0 = 1 )) + GROUP BY aggregation_target, + steps, + prop + HAVING steps = max_steps) + GROUP BY prop SETTINGS allow_experimental_window_functions = 1 + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_breakdown_group + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN 
+ (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_breakdown_group.1 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_breakdown_group.10 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_breakdown_group.11 + ' + + SELECT aggregation_target as person_id + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + avg(step_2_conversion_time) step_2_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner, + median(step_2_conversion_time) step_2_median_conversion_time_inner, + prop + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target, + prop) as max_steps, + step_1_conversion_time, + step_2_conversion_time, + prop + FROM + (SELECT *, + arraySort([latest_0,latest_1,latest_2]) as event_times, + arraySum([if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 7 DAY, 1, 0),if(latest_0 < latest_2 AND latest_2 <= latest_0 + INTERVAL 7 DAY, 1, 0), 1]) AS steps , + arraySort([latest_0,latest_1,latest_2]) as conversion_times, + if(isNotNull(conversion_times[2]) + AND conversion_times[2] <= conversion_times[1] + INTERVAL 7 DAY, dateDiff('second', conversion_times[1], conversion_times[2]), NULL) step_1_conversion_time, + if(isNotNull(conversion_times[3]) + AND conversion_times[3] <= conversion_times[2] + INTERVAL 7 DAY, dateDiff('second', conversion_times[2], conversion_times[3]), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, 
+ if(event = 'sign up', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'play movie', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'buy', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )) + WHERE step_0 = 1 + UNION ALL SELECT *, + arraySort([latest_0,latest_1,latest_2]) as event_times, + arraySum([if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 7 DAY, 1, 0),if(latest_0 < latest_2 AND latest_2 <= latest_0 + INTERVAL 7 DAY, 1, 0), 1]) AS steps , + arraySort([latest_0,latest_1,latest_2]) as conversion_times, + if(isNotNull(conversion_times[2]) + AND conversion_times[2] <= conversion_times[1] + INTERVAL 7 DAY, dateDiff('second', conversion_times[1], conversion_times[2]), NULL) step_1_conversion_time, + if(isNotNull(conversion_times[3]) + AND conversion_times[3] <= conversion_times[2] + INTERVAL 7 DAY, dateDiff('second', conversion_times[2], conversion_times[3]), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'play movie', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'buy', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'sign up', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + 
argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )) + WHERE step_0 = 1 + UNION ALL SELECT *, + arraySort([latest_0,latest_1,latest_2]) as event_times, + arraySum([if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 7 DAY, 1, 0),if(latest_0 < latest_2 AND latest_2 <= latest_0 + INTERVAL 7 DAY, 1, 0), 1]) AS steps , + arraySort([latest_0,latest_1,latest_2]) as conversion_times, + if(isNotNull(conversion_times[2]) + AND conversion_times[2] <= conversion_times[1] + INTERVAL 7 DAY, dateDiff('second', conversion_times[1], conversion_times[2]), NULL) step_1_conversion_time, + if(isNotNull(conversion_times[3]) + AND conversion_times[3] <= conversion_times[2] + INTERVAL 7 DAY, dateDiff('second', conversion_times[2], conversion_times[3]), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'buy', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'sign up', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'play movie', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )) + WHERE step_0 = 1 )) + GROUP BY aggregation_target, + steps, + prop + HAVING steps = max_steps) + WHERE steps IN [2, 3] + AND prop IN ['finance'] + ORDER BY aggregation_target + LIMIT 100 + OFFSET 0 SETTINGS allow_experimental_window_functions = 1 + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_breakdown_group.12 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS 
group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_breakdown_group.13 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_breakdown_group.14 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_breakdown_group.15 + ' + + SELECT aggregation_target as person_id + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + avg(step_2_conversion_time) step_2_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner, + median(step_2_conversion_time) step_2_median_conversion_time_inner, + prop + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target, + prop) as max_steps, + step_1_conversion_time, + step_2_conversion_time, + prop + FROM + (SELECT *, + arraySort([latest_0,latest_1,latest_2]) as event_times, + arraySum([if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 7 DAY, 1, 0),if(latest_0 < latest_2 AND latest_2 <= latest_0 + INTERVAL 7 DAY, 1, 0), 1]) AS steps , + arraySort([latest_0,latest_1,latest_2]) as conversion_times, + if(isNotNull(conversion_times[2]) + AND conversion_times[2] <= conversion_times[1] + INTERVAL 7 DAY, dateDiff('second', conversion_times[1], conversion_times[2]), NULL) step_1_conversion_time, + if(isNotNull(conversion_times[3]) + AND conversion_times[3] <= conversion_times[2] + INTERVAL 7 DAY, dateDiff('second', conversion_times[2], conversion_times[3]), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'sign up', 1, 0) as step_0, + if(step_0 = 1, 
timestamp, null) as latest_0, + if(event = 'play movie', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'buy', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )) + WHERE step_0 = 1 + UNION ALL SELECT *, + arraySort([latest_0,latest_1,latest_2]) as event_times, + arraySum([if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 7 DAY, 1, 0),if(latest_0 < latest_2 AND latest_2 <= latest_0 + INTERVAL 7 DAY, 1, 0), 1]) AS steps , + arraySort([latest_0,latest_1,latest_2]) as conversion_times, + if(isNotNull(conversion_times[2]) + AND conversion_times[2] <= conversion_times[1] + INTERVAL 7 DAY, dateDiff('second', conversion_times[1], conversion_times[2]), NULL) step_1_conversion_time, + if(isNotNull(conversion_times[3]) + AND conversion_times[3] <= conversion_times[2] + INTERVAL 7 DAY, dateDiff('second', conversion_times[2], conversion_times[3]), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'play movie', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'buy', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'sign up', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + 
WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )) + WHERE step_0 = 1 + UNION ALL SELECT *, + arraySort([latest_0,latest_1,latest_2]) as event_times, + arraySum([if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 7 DAY, 1, 0),if(latest_0 < latest_2 AND latest_2 <= latest_0 + INTERVAL 7 DAY, 1, 0), 1]) AS steps , + arraySort([latest_0,latest_1,latest_2]) as conversion_times, + if(isNotNull(conversion_times[2]) + AND conversion_times[2] <= conversion_times[1] + INTERVAL 7 DAY, dateDiff('second', conversion_times[1], conversion_times[2]), NULL) step_1_conversion_time, + if(isNotNull(conversion_times[3]) + AND conversion_times[3] <= conversion_times[2] + INTERVAL 7 DAY, dateDiff('second', conversion_times[2], conversion_times[3]), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'buy', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'sign up', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'play movie', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )) + WHERE step_0 = 1 )) + GROUP BY aggregation_target, + steps, + prop + HAVING steps = max_steps) + WHERE steps IN [1, 2, 3] + AND prop IN ['technology'] + ORDER BY aggregation_target + LIMIT 100 + OFFSET 0 SETTINGS allow_experimental_window_functions = 1 + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_breakdown_group.16 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND 
group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_breakdown_group.17 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_breakdown_group.18 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_breakdown_group.19 + ' + + SELECT aggregation_target as person_id + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + avg(step_2_conversion_time) step_2_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner, + median(step_2_conversion_time) step_2_median_conversion_time_inner, + prop + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target, + prop) as max_steps, + step_1_conversion_time, + step_2_conversion_time, + prop + FROM + (SELECT *, + arraySort([latest_0,latest_1,latest_2]) as event_times, + arraySum([if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 7 DAY, 1, 0),if(latest_0 < latest_2 AND latest_2 <= latest_0 + INTERVAL 7 DAY, 1, 0), 1]) AS steps , + arraySort([latest_0,latest_1,latest_2]) as conversion_times, + if(isNotNull(conversion_times[2]) + AND conversion_times[2] <= conversion_times[1] + INTERVAL 7 DAY, dateDiff('second', conversion_times[1], conversion_times[2]), NULL) step_1_conversion_time, + if(isNotNull(conversion_times[3]) + AND conversion_times[3] <= conversion_times[2] + INTERVAL 7 DAY, dateDiff('second', conversion_times[2], conversion_times[3]), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'sign up', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'play movie', 1, 0) 
as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'buy', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )) + WHERE step_0 = 1 + UNION ALL SELECT *, + arraySort([latest_0,latest_1,latest_2]) as event_times, + arraySum([if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 7 DAY, 1, 0),if(latest_0 < latest_2 AND latest_2 <= latest_0 + INTERVAL 7 DAY, 1, 0), 1]) AS steps , + arraySort([latest_0,latest_1,latest_2]) as conversion_times, + if(isNotNull(conversion_times[2]) + AND conversion_times[2] <= conversion_times[1] + INTERVAL 7 DAY, dateDiff('second', conversion_times[1], conversion_times[2]), NULL) step_1_conversion_time, + if(isNotNull(conversion_times[3]) + AND conversion_times[3] <= conversion_times[2] + INTERVAL 7 DAY, dateDiff('second', conversion_times[2], conversion_times[3]), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'play movie', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'buy', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'sign up', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY 
group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )) + WHERE step_0 = 1 + UNION ALL SELECT *, + arraySort([latest_0,latest_1,latest_2]) as event_times, + arraySum([if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 7 DAY, 1, 0),if(latest_0 < latest_2 AND latest_2 <= latest_0 + INTERVAL 7 DAY, 1, 0), 1]) AS steps , + arraySort([latest_0,latest_1,latest_2]) as conversion_times, + if(isNotNull(conversion_times[2]) + AND conversion_times[2] <= conversion_times[1] + INTERVAL 7 DAY, dateDiff('second', conversion_times[1], conversion_times[2]), NULL) step_1_conversion_time, + if(isNotNull(conversion_times[3]) + AND conversion_times[3] <= conversion_times[2] + INTERVAL 7 DAY, dateDiff('second', conversion_times[2], conversion_times[3]), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'buy', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'sign up', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'play movie', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )) + WHERE step_0 = 1 )) + GROUP BY aggregation_target, + steps, + prop + HAVING steps = max_steps) + WHERE steps IN [2, 3] + AND prop IN ['technology'] + ORDER BY aggregation_target + LIMIT 100 + OFFSET 0 SETTINGS allow_experimental_window_functions = 1 + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_breakdown_group.2 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == 
groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_breakdown_group.3 + ' + + SELECT countIf(steps = 1) step_1, + countIf(steps = 2) step_2, + countIf(steps = 3) step_3, + avg(step_1_average_conversion_time_inner) step_1_average_conversion_time, + avg(step_2_average_conversion_time_inner) step_2_average_conversion_time, + median(step_1_median_conversion_time_inner) step_1_median_conversion_time, + median(step_2_median_conversion_time_inner) step_2_median_conversion_time, + prop + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + avg(step_2_conversion_time) step_2_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner, + median(step_2_conversion_time) step_2_median_conversion_time_inner, + prop + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target, + prop) as max_steps, + step_1_conversion_time, + step_2_conversion_time, + prop + FROM + (SELECT *, + arraySort([latest_0,latest_1,latest_2]) as event_times, + arraySum([if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 7 DAY, 1, 0),if(latest_0 < latest_2 AND latest_2 <= latest_0 + INTERVAL 7 DAY, 1, 0), 1]) AS steps , + arraySort([latest_0,latest_1,latest_2]) as conversion_times, + if(isNotNull(conversion_times[2]) + AND conversion_times[2] <= conversion_times[1] + INTERVAL 7 DAY, dateDiff('second', conversion_times[1], conversion_times[2]), NULL) step_1_conversion_time, + if(isNotNull(conversion_times[3]) + AND conversion_times[3] <= conversion_times[2] + INTERVAL 7 DAY, dateDiff('second', conversion_times[2], conversion_times[3]), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'sign up', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'play movie', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'buy', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == 
groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )) + WHERE step_0 = 1 + UNION ALL SELECT *, + arraySort([latest_0,latest_1,latest_2]) as event_times, + arraySum([if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 7 DAY, 1, 0),if(latest_0 < latest_2 AND latest_2 <= latest_0 + INTERVAL 7 DAY, 1, 0), 1]) AS steps , + arraySort([latest_0,latest_1,latest_2]) as conversion_times, + if(isNotNull(conversion_times[2]) + AND conversion_times[2] <= conversion_times[1] + INTERVAL 7 DAY, dateDiff('second', conversion_times[1], conversion_times[2]), NULL) step_1_conversion_time, + if(isNotNull(conversion_times[3]) + AND conversion_times[3] <= conversion_times[2] + INTERVAL 7 DAY, dateDiff('second', conversion_times[2], conversion_times[3]), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'play movie', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'buy', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'sign up', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )) + WHERE step_0 = 1 + UNION ALL SELECT *, + arraySort([latest_0,latest_1,latest_2]) as event_times, + arraySum([if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 7 DAY, 1, 0),if(latest_0 < latest_2 AND latest_2 <= latest_0 + INTERVAL 7 DAY, 1, 0), 1]) AS steps , + arraySort([latest_0,latest_1,latest_2]) as conversion_times, + if(isNotNull(conversion_times[2]) + AND conversion_times[2] <= conversion_times[1] + INTERVAL 7 DAY, dateDiff('second', conversion_times[1], conversion_times[2]), NULL) step_1_conversion_time, + if(isNotNull(conversion_times[3]) + AND conversion_times[3] <= conversion_times[2] + INTERVAL 7 DAY, dateDiff('second', conversion_times[2], conversion_times[3]), NULL) step_2_conversion_time + FROM + 
(SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'buy', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'sign up', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'play movie', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )) + WHERE step_0 = 1 )) + GROUP BY aggregation_target, + steps, + prop + HAVING steps = max_steps) + GROUP BY prop SETTINGS allow_experimental_window_functions = 1 + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_breakdown_group.4 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_breakdown_group.5 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_breakdown_group.6 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count 
+ FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_breakdown_group.7 + ' + + SELECT aggregation_target as person_id + FROM + (SELECT aggregation_target, + steps, + avg(step_1_conversion_time) step_1_average_conversion_time_inner, + avg(step_2_conversion_time) step_2_average_conversion_time_inner, + median(step_1_conversion_time) step_1_median_conversion_time_inner, + median(step_2_conversion_time) step_2_median_conversion_time_inner, + prop + FROM + (SELECT aggregation_target, + steps, + max(steps) over (PARTITION BY aggregation_target, + prop) as max_steps, + step_1_conversion_time, + step_2_conversion_time, + prop + FROM + (SELECT *, + arraySort([latest_0,latest_1,latest_2]) as event_times, + arraySum([if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 7 DAY, 1, 0),if(latest_0 < latest_2 AND latest_2 <= latest_0 + INTERVAL 7 DAY, 1, 0), 1]) AS steps , + arraySort([latest_0,latest_1,latest_2]) as conversion_times, + if(isNotNull(conversion_times[2]) + AND conversion_times[2] <= conversion_times[1] + INTERVAL 7 DAY, dateDiff('second', conversion_times[1], conversion_times[2]), NULL) step_1_conversion_time, + if(isNotNull(conversion_times[3]) + AND conversion_times[3] <= conversion_times[2] + INTERVAL 7 DAY, dateDiff('second', conversion_times[2], conversion_times[3]), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'sign up', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'play movie', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'buy', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 
23:59:59' ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )) + WHERE step_0 = 1 + UNION ALL SELECT *, + arraySort([latest_0,latest_1,latest_2]) as event_times, + arraySum([if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 7 DAY, 1, 0),if(latest_0 < latest_2 AND latest_2 <= latest_0 + INTERVAL 7 DAY, 1, 0), 1]) AS steps , + arraySort([latest_0,latest_1,latest_2]) as conversion_times, + if(isNotNull(conversion_times[2]) + AND conversion_times[2] <= conversion_times[1] + INTERVAL 7 DAY, dateDiff('second', conversion_times[1], conversion_times[2]), NULL) step_1_conversion_time, + if(isNotNull(conversion_times[3]) + AND conversion_times[3] <= conversion_times[2] + INTERVAL 7 DAY, dateDiff('second', conversion_times[2], conversion_times[3]), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'play movie', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'buy', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'sign up', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )) + WHERE step_0 = 1 + UNION ALL SELECT *, + arraySort([latest_0,latest_1,latest_2]) as event_times, + arraySum([if(latest_0 < latest_1 AND latest_1 <= latest_0 + INTERVAL 7 DAY, 1, 0),if(latest_0 < latest_2 AND latest_2 <= latest_0 + INTERVAL 7 DAY, 1, 0), 1]) AS steps , + arraySort([latest_0,latest_1,latest_2]) as conversion_times, + if(isNotNull(conversion_times[2]) + AND conversion_times[2] <= conversion_times[1] + INTERVAL 7 DAY, dateDiff('second', conversion_times[1], conversion_times[2]), NULL) step_1_conversion_time, + if(isNotNull(conversion_times[3]) + AND conversion_times[3] <= conversion_times[2] + INTERVAL 7 DAY, dateDiff('second', conversion_times[2], conversion_times[3]), NULL) step_2_conversion_time + FROM + (SELECT aggregation_target, + timestamp, + step_0, + latest_0, + step_1, + min(latest_1) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS 
BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_1, + step_2, + min(latest_2) over (PARTITION by aggregation_target, + prop + ORDER BY timestamp DESC ROWS BETWEEN UNBOUNDED PRECEDING AND 0 PRECEDING) latest_2 , + if(has(['technology', 'finance'], prop), prop, 'Other') as prop + FROM + (SELECT aggregation_target, + timestamp, + if(event = 'buy', 1, 0) as step_0, + if(step_0 = 1, timestamp, null) as latest_0, + if(event = 'sign up', 1, 0) as step_1, + if(step_1 = 1, timestamp, null) as latest_1, + if(event = 'play movie', 1, 0) as step_2, + if(step_2 = 1, timestamp, null) as latest_2, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS prop + FROM + (SELECT e.event as event, + e.team_id as team_id, + e.distinct_id as distinct_id, + e.timestamp as timestamp, + pdi.person_id as aggregation_target, + groups_0.group_properties_0 as group_properties_0 + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event IN ['buy', 'play movie', 'sign up'] + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' ) events + WHERE (step_0 = 1 + OR step_1 = 1 + OR step_2 = 1) )) + WHERE step_0 = 1 )) + GROUP BY aggregation_target, + steps, + prop + HAVING steps = max_steps) + WHERE steps IN [1, 2, 3] + AND prop IN ['finance'] + ORDER BY aggregation_target + LIMIT 100 + OFFSET 0 SETTINGS allow_experimental_window_functions = 1 + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_breakdown_group.8 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- +# name: TestFunnelUnorderedStepsBreakdown.test_funnel_breakdown_group.9 + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-08 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 10 + OFFSET 0) + ' +--- diff --git a/ee/clickhouse/queries/funnels/test/breakdown_cases.py b/ee/clickhouse/queries/funnels/test/breakdown_cases.py new file mode 100644 index 0000000000000..2046c60d286d0 --- /dev/null +++ b/ee/clickhouse/queries/funnels/test/breakdown_cases.py @@ -0,0 +1,1703 @@ +from string import 
ascii_lowercase +from uuid import UUID + +from ee.clickhouse.materialized_columns import materialize +from ee.clickhouse.models.group import create_group +from ee.clickhouse.queries.breakdown_props import ALL_USERS_COHORT_ID +from ee.clickhouse.queries.funnels.funnel import ClickhouseFunnel +from ee.clickhouse.util import snapshot_clickhouse_queries +from posthog.constants import INSIGHT_FUNNELS +from posthog.models.cohort import Cohort +from posthog.models.filters import Filter +from posthog.models.group_type_mapping import GroupTypeMapping +from posthog.test.base import APIBaseTest, test_with_materialized_columns + + +def funnel_breakdown_test_factory(Funnel, FunnelPerson, _create_event, _create_action, _create_person): + class TestFunnelBreakdown(APIBaseTest): + def _get_people_at_step(self, filter, funnel_step, breakdown_value=None): + person_filter = filter.with_data({"funnel_step": funnel_step, "funnel_step_breakdown": breakdown_value}) + result = FunnelPerson(person_filter, self.team)._exec_query() + return [row[0] for row in result] + + @test_with_materialized_columns(["$browser"]) + def test_funnel_step_breakdown_event(self): + + filters = { + "events": [{"id": "sign up", "order": 0}, {"id": "play movie", "order": 1}, {"id": "buy", "order": 2},], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-08", + "funnel_window_days": 7, + "breakdown_type": "event", + "breakdown": "$browser", + } + + filter = Filter(data=filters) + funnel = Funnel(filter, self.team) + + # event + person1 = _create_person(distinct_ids=["person1"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person1", + properties={"key": "val", "$browser": "Chrome"}, + timestamp="2020-01-01T12:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person1", + properties={"key": "val", "$browser": "Chrome"}, + timestamp="2020-01-01T13:00:00Z", + ) + _create_event( + team=self.team, + event="buy", + distinct_id="person1", + properties={"key": "val", "$browser": "Chrome"}, + timestamp="2020-01-01T15:00:00Z", + ) + + person2 = _create_person(distinct_ids=["person2"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person2", + properties={"key": "val", "$browser": "Safari"}, + timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person2", + properties={"key": "val", "$browser": "Safari"}, + timestamp="2020-01-02T16:00:00Z", + ) + + person3 = _create_person(distinct_ids=["person3"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person3", + properties={"key": "val", "$browser": "Safari"}, + timestamp="2020-01-02T14:00:00Z", + ) + + result = funnel.run() + self.assertEqual( + result[0], + [ + { + "action_id": "sign up", + "name": "sign up", + "custom_name": None, + "order": 0, + "people": [person1.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Chrome", + "breakdown_value": "Chrome", + }, + { + "action_id": "play movie", + "name": "play movie", + "custom_name": None, + "order": 1, + "people": [person1.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": 3600.0, + "median_conversion_time": 3600.0, + "breakdown": "Chrome", + "breakdown_value": "Chrome", + }, + { + 
"action_id": "buy", + "name": "buy", + "custom_name": None, + "order": 2, + "people": [person1.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": 7200.0, + "median_conversion_time": 7200.0, + "breakdown": "Chrome", + "breakdown_value": "Chrome", + }, + ], + ) + self.assertCountEqual(self._get_people_at_step(filter, 1, "Chrome"), [person1.uuid]) + self.assertCountEqual(self._get_people_at_step(filter, 2, "Chrome"), [person1.uuid]) + self.assertEqual( + result[1], + [ + { + "action_id": "sign up", + "name": "sign up", + "custom_name": None, + "order": 0, + "people": [person2.uuid, person3.uuid] + if Funnel == ClickhouseFunnel + else [], # backwards compatibility + "count": 2, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Safari", + "breakdown_value": "Safari", + }, + { + "action_id": "play movie", + "name": "play movie", + "custom_name": None, + "order": 1, + "people": [person2.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": 7200.0, + "median_conversion_time": 7200.0, + "breakdown": "Safari", + "breakdown_value": "Safari", + }, + { + "action_id": "buy", + "name": "buy", + "custom_name": None, + "order": 2, + "people": [], + "count": 0, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Safari", + "breakdown_value": "Safari", + }, + ], + ) + + self.assertCountEqual(self._get_people_at_step(filter, 1, "Safari"), [person2.uuid, person3.uuid]) + self.assertCountEqual(self._get_people_at_step(filter, 2, "Safari"), [person2.uuid]) + + @test_with_materialized_columns(["$browser"]) + def test_funnel_step_breakdown_event_with_other(self): + + filters = { + "events": [{"id": "sign up", "order": 0}, {"id": "play movie", "order": 1}, {"id": "buy", "order": 2},], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-08", + "funnel_window_days": 7, + "breakdown_type": "event", + "breakdown": "$browser", + "breakdown_limit": 1, + } + + filter = Filter(data=filters) + funnel = Funnel(filter, self.team) + + # event + person1 = _create_person(distinct_ids=["person1"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person1", + properties={"key": "val", "$browser": "Chrome"}, + timestamp="2020-01-01T12:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person1", + properties={"key": "val", "$browser": "Chrome"}, + timestamp="2020-01-01T13:00:00Z", + ) + _create_event( + team=self.team, + event="buy", + distinct_id="person1", + properties={"key": "val", "$browser": "Chrome"}, + timestamp="2020-01-01T15:00:00Z", + ) + + person2 = _create_person(distinct_ids=["person2"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person2", + properties={"key": "val", "$browser": "Safari"}, + timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person2", + properties={"key": "val", "$browser": "Safari"}, + timestamp="2020-01-02T16:00:00Z", + ) + + person3 = _create_person(distinct_ids=["person3"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person3", + properties={"key": "val", "$browser": "Safari"}, + timestamp="2020-01-02T14:00:00Z", + ) + + person4 = _create_person(distinct_ids=["person4"], 
team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person4", + properties={"key": "val", "$browser": "random"}, + timestamp="2020-01-02T14:00:00Z", + ) + + person5 = _create_person(distinct_ids=["person5"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person5", + properties={"key": "val", "$browser": "another one"}, + timestamp="2020-01-02T15:00:00Z", + ) + + result = funnel.run() + + people = result[0][0].pop("people") + self.assertCountEqual( + people, [person1.uuid, person4.uuid, person5.uuid] if Funnel == ClickhouseFunnel else [] + ) + + self.assertEqual( + result[0], + [ + { + "action_id": "sign up", + "name": "sign up", + "custom_name": None, + "order": 0, + # "people" popped above because of flaky ordering in assertEqual + "count": 3, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Other", + "breakdown_value": "Other", + }, + { + "action_id": "play movie", + "name": "play movie", + "custom_name": None, + "order": 1, + "people": [person1.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": 3600.0, + "median_conversion_time": 3600.0, + "breakdown": "Other", + "breakdown_value": "Other", + }, + { + "action_id": "buy", + "name": "buy", + "custom_name": None, + "order": 2, + "people": [person1.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": 7200.0, + "median_conversion_time": 7200.0, + "breakdown": "Other", + "breakdown_value": "Other", + }, + ], + ) + self.assertCountEqual( + self._get_people_at_step(filter, 1, "Other"), [person1.uuid, person4.uuid, person5.uuid] + ) + self.assertCountEqual(self._get_people_at_step(filter, 2, "Other"), [person1.uuid]) + + self.assertEqual( + result[1], + [ + { + "action_id": "sign up", + "name": "sign up", + "custom_name": None, + "order": 0, + "people": [person2.uuid, person3.uuid] + if Funnel == ClickhouseFunnel + else [], # backwards compatibility + "count": 2, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Safari", + "breakdown_value": "Safari", + }, + { + "action_id": "play movie", + "name": "play movie", + "custom_name": None, + "order": 1, + "people": [person2.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": 7200.0, + "median_conversion_time": 7200.0, + "breakdown": "Safari", + "breakdown_value": "Safari", + }, + { + "action_id": "buy", + "name": "buy", + "custom_name": None, + "order": 2, + "people": [], + "count": 0, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Safari", + "breakdown_value": "Safari", + }, + ], + ) + + self.assertCountEqual(self._get_people_at_step(filter, 1, "Safari"), [person2.uuid, person3.uuid]) + self.assertCountEqual(self._get_people_at_step(filter, 2, "Safari"), [person2.uuid]) + + @test_with_materialized_columns(["$browser"]) + def test_funnel_step_breakdown_event_no_type(self): + + filters = { + "events": [{"id": "sign up", "order": 0}, {"id": "play movie", "order": 1}, {"id": "buy", "order": 2},], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-08", + "funnel_window_days": 7, + "breakdown": "$browser", + } + + filter = Filter(data=filters) + funnel = Funnel(filter, self.team) + + # event +
person1 = _create_person(distinct_ids=["person1"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person1", + properties={"key": "val", "$browser": "Chrome"}, + timestamp="2020-01-01T12:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person1", + properties={"key": "val", "$browser": "Chrome"}, + timestamp="2020-01-01T13:00:00Z", + ) + _create_event( + team=self.team, + event="buy", + distinct_id="person1", + properties={"key": "val", "$browser": "Chrome"}, + timestamp="2020-01-01T15:00:00Z", + ) + + person2 = _create_person(distinct_ids=["person2"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person2", + properties={"key": "val", "$browser": "Safari"}, + timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person2", + properties={"key": "val", "$browser": "Safari"}, + timestamp="2020-01-02T16:00:00Z", + ) + + person3 = _create_person(distinct_ids=["person3"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person3", + properties={"key": "val", "$browser": "Safari"}, + timestamp="2020-01-02T14:00:00Z", + ) + + result = funnel.run() + self.assertEqual( + result[0], + [ + { + "action_id": "sign up", + "name": "sign up", + "custom_name": None, + "order": 0, + "people": [person1.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Chrome", + "breakdown_value": "Chrome", + }, + { + "action_id": "play movie", + "name": "play movie", + "custom_name": None, + "order": 1, + "people": [person1.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": 3600.0, + "median_conversion_time": 3600.0, + "breakdown": "Chrome", + "breakdown_value": "Chrome", + }, + { + "action_id": "buy", + "name": "buy", + "custom_name": None, + "order": 2, + "people": [person1.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": 7200.0, + "median_conversion_time": 7200.0, + "breakdown": "Chrome", + "breakdown_value": "Chrome", + }, + ], + ) + self.assertCountEqual(self._get_people_at_step(filter, 1, "Chrome"), [person1.uuid]) + self.assertCountEqual(self._get_people_at_step(filter, 2, "Chrome"), [person1.uuid]) + self.assertEqual( + result[1], + [ + { + "action_id": "sign up", + "name": "sign up", + "custom_name": None, + "order": 0, + "people": [person2.uuid, person3.uuid] + if Funnel == ClickhouseFunnel + else [], # backwards compatibility + "count": 2, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Safari", + "breakdown_value": "Safari", + }, + { + "action_id": "play movie", + "name": "play movie", + "custom_name": None, + "order": 1, + "people": [person2.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": 7200.0, + "median_conversion_time": 7200.0, + "breakdown": "Safari", + "breakdown_value": "Safari", + }, + { + "action_id": "buy", + "name": "buy", + "custom_name": None, + "order": 2, + "people": [], + "count": 0, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Safari", + "breakdown_value": "Safari", + }, + ], + ) 
+ + self.assertCountEqual(self._get_people_at_step(filter, 1, "Safari"), [person2.uuid, person3.uuid]) + self.assertCountEqual(self._get_people_at_step(filter, 2, "Safari"), [person2.uuid]) + + @test_with_materialized_columns(person_properties=["$browser"]) + def test_funnel_step_breakdown_person(self): + + filters = { + "events": [{"id": "sign up", "order": 0}, {"id": "play movie", "order": 1}, {"id": "buy", "order": 2},], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-08", + "funnel_window_days": 7, + "breakdown_type": "person", + "breakdown": "$browser", + } + + filter = Filter(data=filters) + funnel = Funnel(filter, self.team) + + # event + person1 = _create_person(distinct_ids=["person1"], team_id=self.team.pk, properties={"$browser": "Chrome"}) + _create_event( + team=self.team, + event="sign up", + distinct_id="person1", + properties={"key": "val"}, + timestamp="2020-01-01T12:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person1", + properties={"key": "val"}, + timestamp="2020-01-01T13:00:00Z", + ) + _create_event( + team=self.team, + event="buy", + distinct_id="person1", + properties={"key": "val"}, + timestamp="2020-01-01T15:00:00Z", + ) + + person2 = _create_person(distinct_ids=["person2"], team_id=self.team.pk, properties={"$browser": "Safari"}) + _create_event( + team=self.team, + event="sign up", + distinct_id="person2", + properties={"key": "val"}, + timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person2", + properties={"key": "val"}, + timestamp="2020-01-02T16:00:00Z", + ) + + result = funnel.run() + self.assertEqual( + result[0], + [ + { + "action_id": "sign up", + "name": "sign up", + "custom_name": None, + "order": 0, + "people": [person1.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Chrome", + "breakdown_value": "Chrome", + }, + { + "action_id": "play movie", + "name": "play movie", + "custom_name": None, + "order": 1, + "people": [person1.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": 3600.0, + "median_conversion_time": 3600.0, + "breakdown": "Chrome", + "breakdown_value": "Chrome", + }, + { + "action_id": "buy", + "name": "buy", + "custom_name": None, + "order": 2, + "people": [person1.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": 7200.0, + "median_conversion_time": 7200.0, + "breakdown": "Chrome", + "breakdown_value": "Chrome", + }, + ], + ) + self.assertCountEqual(self._get_people_at_step(filter, 1, "Chrome"), [person1.uuid]) + self.assertCountEqual(self._get_people_at_step(filter, 2, "Chrome"), [person1.uuid]) + + self.assertEqual( + result[1], + [ + { + "action_id": "sign up", + "name": "sign up", + "custom_name": None, + "order": 0, + "people": [person2.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Safari", + "breakdown_value": "Safari", + }, + { + "action_id": "play movie", + "name": "play movie", + "custom_name": None, + "order": 1, + "people": [person2.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + 
"average_conversion_time": 7200.0, + "median_conversion_time": 7200.0, + "breakdown": "Safari", + "breakdown_value": "Safari", + }, + { + "action_id": "buy", + "name": "buy", + "custom_name": None, + "order": 2, + "people": [], + "count": 0, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Safari", + "breakdown_value": "Safari", + }, + ], + ) + self.assertCountEqual(self._get_people_at_step(filter, 1, "Safari"), [person2.uuid]) + self.assertCountEqual(self._get_people_at_step(filter, 3, "Safari"), []) + + @test_with_materialized_columns(["some_breakdown_val"]) + def test_funnel_step_breakdown_limit(self): + + filters = { + "events": [{"id": "sign up", "order": 0}, {"id": "play movie", "order": 1}, {"id": "buy", "order": 2},], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-08", + "funnel_window_days": 7, + "breakdown_type": "event", + "breakdown": "some_breakdown_val", + "breakdown_limit": 5, + } + + filter = Filter(data=filters) + funnel = Funnel(filter, self.team) + + for num in range(10): + for i in range(num): + _create_person(distinct_ids=[f"person_{num}_{i}"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id=f"person_{num}_{i}", + properties={"key": "val", "some_breakdown_val": num}, + timestamp="2020-01-01T12:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id=f"person_{num}_{i}", + properties={"key": "val", "some_breakdown_val": num}, + timestamp="2020-01-01T13:00:00Z", + ) + _create_event( + team=self.team, + event="buy", + distinct_id=f"person_{num}_{i}", + properties={"key": "val", "some_breakdown_val": num}, + timestamp="2020-01-01T15:00:00Z", + ) + + result = funnel.run() + + # assert that we give 5 at a time at most and that those values are the most popular ones + breakdown_vals = sorted([res[0]["breakdown"] for res in result]) + self.assertEqual(["5", "6", "7", "8", "9", "Other"], breakdown_vals) + + @test_with_materialized_columns(["some_breakdown_val"]) + def test_funnel_step_custom_breakdown_limit_with_nulls(self): + + filters = { + "events": [{"id": "sign up", "order": 0}, {"id": "play movie", "order": 1}, {"id": "buy", "order": 2},], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-08", + "funnel_window_days": 7, + "breakdown_type": "event", + "breakdown_limit": 3, + "breakdown": "some_breakdown_val", + } + + filter = Filter(data=filters) + funnel = Funnel(filter, self.team) + + for num in range(5): + for i in range(num): + _create_person(distinct_ids=[f"person_{num}_{i}"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id=f"person_{num}_{i}", + properties={"key": "val", "some_breakdown_val": num}, + timestamp="2020-01-01T12:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id=f"person_{num}_{i}", + properties={"key": "val", "some_breakdown_val": num}, + timestamp="2020-01-01T13:00:00Z", + ) + _create_event( + team=self.team, + event="buy", + distinct_id=f"person_{num}_{i}", + properties={"key": "val", "some_breakdown_val": num}, + timestamp="2020-01-01T15:00:00Z", + ) + + # no breakdown value for this guy + person0 = _create_person(distinct_ids=[f"person_null"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id=f"person_null", + properties={"key": "val"}, + timestamp="2020-01-01T12:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + 
distinct_id=f"person_null", + properties={"key": "val"}, + timestamp="2020-01-01T13:00:00Z", + ) + _create_event( + team=self.team, + event="buy", + distinct_id=f"person_null", + properties={"key": "val"}, + timestamp="2020-01-01T15:00:00Z", + ) + + result = funnel.run() + + breakdown_vals = sorted([res[0]["breakdown"] for res in result]) + self.assertEqual(["2", "3", "4", "Other"], breakdown_vals) + # skipped 1 and '' because the limit was 3. + self.assertTrue(person0.uuid in self._get_people_at_step(filter, 1, "Other")) + + @test_with_materialized_columns(["some_breakdown_val"]) + def test_funnel_step_custom_breakdown_limit_with_nulls_included(self): + + filters = { + "events": [{"id": "sign up", "order": 0}, {"id": "play movie", "order": 1}, {"id": "buy", "order": 2},], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-08", + "funnel_window_days": 7, + "breakdown_type": "event", + "breakdown_limit": 6, + "breakdown": "some_breakdown_val", + } + + filter = Filter(data=filters) + funnel = Funnel(filter, self.team) + + for num in range(5): + for i in range(num): + _create_person(distinct_ids=[f"person_{num}_{i}"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id=f"person_{num}_{i}", + properties={"key": "val", "some_breakdown_val": num}, + timestamp="2020-01-01T12:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id=f"person_{num}_{i}", + properties={"key": "val", "some_breakdown_val": num}, + timestamp="2020-01-01T13:00:00Z", + ) + _create_event( + team=self.team, + event="buy", + distinct_id=f"person_{num}_{i}", + properties={"key": "val", "some_breakdown_val": num}, + timestamp="2020-01-01T15:00:00Z", + ) + + # no breakdown value for this guy + p_null = _create_person(distinct_ids=[f"person_null"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id=f"person_null", + properties={"key": "val"}, + timestamp="2020-01-01T12:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id=f"person_null", + properties={"key": "val"}, + timestamp="2020-01-01T13:00:00Z", + ) + _create_event( + team=self.team, + event="buy", + distinct_id=f"person_null", + properties={"key": "val"}, + timestamp="2020-01-01T15:00:00Z", + ) + + result = funnel.run() + + breakdown_vals = sorted([res[0]["breakdown"] for res in result]) + self.assertEqual(["", "1", "2", "3", "4"], breakdown_vals) + # included 1 and '' because the limit was 6. 
+ + for i in range(1, 5): + self.assertEqual(len(self._get_people_at_step(filter, 3, str(i))), i) + + self.assertEqual([p_null.uuid], self._get_people_at_step(filter, 1, "")) + self.assertEqual([p_null.uuid], self._get_people_at_step(filter, 3, "")) + + @test_with_materialized_columns(["$browser"]) + def test_funnel_step_breakdown_event_single_person_multiple_breakdowns(self): + + filters = { + "events": [{"id": "sign up", "order": 0}], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-08", + "funnel_window_days": 7, + "breakdown_type": "event", + "breakdown": "$browser", + } + + filter = Filter(data=filters) + funnel = Funnel(filter, self.team) + + # event + person1 = _create_person(distinct_ids=["person1"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person1", + properties={"key": "val", "$browser": "Chrome"}, + timestamp="2020-01-01T12:00:00Z", + ) + _create_event( + team=self.team, + event="sign up", + distinct_id="person1", + properties={"key": "val", "$browser": "Safari"}, + timestamp="2020-01-02T13:00:00Z", + ) + _create_event( + team=self.team, + event="sign up", + distinct_id="person1", + properties={"key": "val", "$browser": "Mac"}, + timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, + event="sign up", + distinct_id="person1", + properties={"key": "val", "$browser": 0}, # mixed property type! + timestamp="2020-01-02T15:00:00Z", + ) + + result = funnel.run() + result = sorted(result, key=lambda res: res[0]["breakdown"]) + + self.assertEqual( + result[0], + [ + { + "action_id": "sign up", + "name": "sign up", + "custom_name": None, + "order": 0, + "people": [person1.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "0", + "breakdown_value": "0", + }, + ], + ) + self.assertCountEqual(self._get_people_at_step(filter, 1, "0"), [person1.uuid]) + + self.assertEqual( + result[1], + [ + { + "action_id": "sign up", + "name": "sign up", + "custom_name": None, + "order": 0, + "people": [person1.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Chrome", + "breakdown_value": "Chrome", + }, + ], + ) + self.assertCountEqual(self._get_people_at_step(filter, 1, "Chrome"), [person1.uuid]) + + self.assertEqual( + result[2], + [ + { + "action_id": "sign up", + "name": "sign up", + "custom_name": None, + "order": 0, + "people": [person1.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Mac", + "breakdown_value": "Mac", + }, + ], + ) + self.assertCountEqual(self._get_people_at_step(filter, 1, "Mac"), [person1.uuid]) + + self.assertEqual( + result[3], + [ + { + "action_id": "sign up", + "name": "sign up", + "custom_name": None, + "order": 0, + "people": [person1.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Safari", + "breakdown_value": "Safari", + }, + ], + ) + self.assertCountEqual(self._get_people_at_step(filter, 1, "Safari"), [person1.uuid]) + + def test_funnel_step_breakdown_event_single_person_events_with_multiple_properties(self): + + filters = 
{ + "events": [{"id": "sign up", "order": 0}, {"id": "play movie", "order": 1}], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-08", + "funnel_window_days": 7, + "breakdown_type": "event", + "breakdown": "$browser", + } + + filter = Filter(data=filters) + funnel = Funnel(filter, self.team) + + # event + person1 = _create_person(distinct_ids=["person1"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person1", + properties={"key": "val", "$browser": "Chrome"}, + timestamp="2020-01-01T12:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person1", + properties={"key": "val", "$browser": "Safari"}, + timestamp="2020-01-02T12:30:00Z", + ) + _create_event( + team=self.team, + event="sign up", + distinct_id="person1", + properties={"key": "val", "$browser": "Safari"}, + timestamp="2020-01-02T13:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person1", + properties={"key": "val", "$browser": "Safari"}, + timestamp="2020-01-02T14:00:00Z", + ) + + result = funnel.run() + self.assertEqual( + result[0], + [ + { + "action_id": "sign up", + "name": "sign up", + "custom_name": None, + "order": 0, + "people": [person1.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Chrome", + "breakdown_value": "Chrome", + }, + { + "action_id": "play movie", + "name": "play movie", + "custom_name": None, + "order": 1, + "people": [], + "count": 0, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Chrome", + "breakdown_value": "Chrome", + }, + ], + ) + self.assertCountEqual(self._get_people_at_step(filter, 1, "Chrome"), [person1.uuid]) + self.assertCountEqual(self._get_people_at_step(filter, 2, "Chrome"), []) + + self.assertEqual( + result[1], + [ + { + "action_id": "sign up", + "name": "sign up", + "custom_name": None, + "order": 0, + "people": [person1.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Safari", + "breakdown_value": "Safari", + }, + { + "action_id": "play movie", + "name": "play movie", + "custom_name": None, + "order": 1, + "people": [person1.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": 3600, + "median_conversion_time": 3600, + "breakdown": "Safari", + "breakdown_value": "Safari", + }, + ], + ) + self.assertCountEqual(self._get_people_at_step(filter, 1, "Safari"), [person1.uuid]) + self.assertCountEqual(self._get_people_at_step(filter, 2, "Safari"), [person1.uuid]) + + @test_with_materialized_columns(person_properties=["key"], verify_no_jsonextract=False) + def test_funnel_cohort_breakdown(self): + # This caused some issues with SQL parsing + person = _create_person(distinct_ids=[f"person1"], team_id=self.team.pk, properties={"key": "value"}) + _create_event( + team=self.team, + event="sign up", + distinct_id=f"person1", + properties={}, + timestamp="2020-01-02T12:00:00Z", + ) + cohort = Cohort.objects.create( + team=self.team, + name="test_cohort", + groups=[{"properties": [{"key": "key", "value": "value", "type": "person"}]}], + ) + filters = { + "events": [{"id": "sign up", "order": 0}, {"id": "play movie", "order": 1}, {"id": "buy", 
"order": 2},], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-08", + "funnel_window_days": 7, + "breakdown_type": "cohort", + "breakdown": ["all", cohort.pk], + } + filter = Filter(data=filters) + funnel = ClickhouseFunnel(filter, self.team) + + result = funnel.run() + self.assertEqual(len(result[0]), 3) + self.assertEqual(result[0][0]["breakdown"], "all users") + self.assertEqual(len(result[1]), 3) + self.assertEqual(result[1][0]["breakdown"], "test_cohort") + self.assertCountEqual(self._get_people_at_step(filter, 1, cohort.pk), [person.uuid]) + self.assertCountEqual(self._get_people_at_step(filter, 2, cohort.pk), []) + + self.assertCountEqual(self._get_people_at_step(filter, 1, ALL_USERS_COHORT_ID), [person.uuid]) + self.assertCountEqual(self._get_people_at_step(filter, 2, ALL_USERS_COHORT_ID), []) + + # non array + filters = { + "events": [{"id": "sign up", "order": 0}, {"id": "play movie", "order": 1}, {"id": "buy", "order": 2},], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-08", + "funnel_window_days": 7, + "breakdown_type": "cohort", + "breakdown": cohort.pk, + } + filter = Filter(data=filters) + funnel = ClickhouseFunnel(filter, self.team) + + result = funnel.run() + self.assertEqual(len(result[0]), 3) + self.assertEqual(result[0][0]["breakdown"], "test_cohort") + self.assertEqual(result[0][0]["breakdown_value"], cohort.pk) + self.assertCountEqual(self._get_people_at_step(filter, 1, cohort.pk), [person.uuid]) + self.assertCountEqual(self._get_people_at_step(filter, 2, cohort.pk), []) + + def test_basic_funnel_default_funnel_days_breakdown_event(self): + person = _create_person(distinct_ids=["user_1"], team_id=self.team.pk) + _create_event( + team=self.team, + event="user signed up", + distinct_id="user_1", + timestamp="2020-01-02T14:00:00Z", + properties={"$current_url": "https://posthog.com/docs/x"}, + ) + _create_event( + team=self.team, + event="paid", + distinct_id="user_1", + timestamp="2020-01-10T14:00:00Z", + properties={"$current_url": "https://posthog.com/docs/x"}, + ) + + # Dummy events to make sure that breakdown is not confused + # It was confused before due to the nature of fetching breakdown values with a LIMIT based on value popularity + # See https://github.com/PostHog/posthog/pull/5496 + for current_url_letter in ascii_lowercase[:20]: + # Twenty dummy breakdown values + for _ in range(2): + # Each twice, so that the breakdown values from dummy events rank higher in raw order + # This test makes sure that events are prefiltered properly to avoid problems with this raw order + _create_event( + team=self.team, + event="user signed up", + distinct_id="user_1", + timestamp="2020-01-02T14:00:00Z", + properties={"$current_url": f"https://posthog.com/blog/{current_url_letter}"}, + ) + + filters = { + "events": [ + { + "id": "user signed up", + "type": "events", + "order": 0, + "properties": [ + { + "key": "$current_url", + "operator": "icontains", + "type": "event", + "value": "https://posthog.com/docs", + } + ], + }, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-14", + "breakdown": "$current_url", + "breakdown_type": "event", + } + + result = ClickhouseFunnel(Filter(data=filters), self.team).run() + + self.assertEqual( + result, + [ + [ + { + "action_id": "user signed up", + "average_conversion_time": None, + "breakdown": "https://posthog.com/docs/x", + "breakdown_value": "https://posthog.com/docs/x", + "count": 1, + 
"median_conversion_time": None, + "name": "user signed up", + "custom_name": None, + "order": 0, + "people": [UUID(bytes=person.uuid.bytes)], + "type": "events", + }, + { + "action_id": "paid", + "average_conversion_time": 691200.0, + "breakdown": "https://posthog.com/docs/x", + "breakdown_value": "https://posthog.com/docs/x", + "count": 1, + "median_conversion_time": 691200.0, + "name": "paid", + "custom_name": None, + "order": 1, + "people": [UUID(bytes=person.uuid.bytes)], + "type": "events", + }, + ] + ], + ) + + @test_with_materialized_columns(["$current_url"]) + def test_basic_funnel_default_funnel_days_breakdown_action(self): + # Same case as test_basic_funnel_default_funnel_days_breakdown_event but with an action + user_signed_up_action = _create_action(name="user signed up", event="user signed up", team=self.team,) + person = _create_person(distinct_ids=["user_1"], team_id=self.team.pk) + _create_event( + team=self.team, + event="user signed up", + distinct_id="user_1", + timestamp="2020-01-02T14:00:00Z", + properties={"$current_url": "https://posthog.com/docs/x"}, + ) + _create_event( + team=self.team, + event="paid", + distinct_id="user_1", + timestamp="2020-01-10T14:00:00Z", + properties={"$current_url": "https://posthog.com/docs/x"}, + ) + + for current_url_letter in ascii_lowercase[:20]: + for _ in range(2): + _create_event( + team=self.team, + event="user signed up", + distinct_id="user_1", + timestamp="2020-01-02T14:00:00Z", + properties={"$current_url": f"https://posthog.com/blog/{current_url_letter}"}, + ) + + filters = { + "actions": [ + { + "id": user_signed_up_action.id, + "order": 0, + "properties": [ + { + "key": "$current_url", + "operator": "icontains", + "type": "event", + "value": "https://posthog.com/docs", + } + ], + } + ], + "events": [{"id": "paid", "type": "events", "order": 1},], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-14", + "breakdown": "$current_url", + "breakdown_type": "event", + } + + result = ClickhouseFunnel(Filter(data=filters), self.team).run() + + self.assertEqual( + result, + [ + [ + { + "action_id": user_signed_up_action.id, + "average_conversion_time": None, + "breakdown": "https://posthog.com/docs/x", + "breakdown_value": "https://posthog.com/docs/x", + "count": 1, + "median_conversion_time": None, + "name": "user signed up", + "custom_name": None, + "order": 0, + "people": [UUID(bytes=person.uuid.bytes)], + "type": "actions", + }, + { + "action_id": "paid", + "average_conversion_time": 691200.0, + "breakdown": "https://posthog.com/docs/x", + "breakdown_value": "https://posthog.com/docs/x", + "count": 1, + "median_conversion_time": 691200.0, + "name": "paid", + "custom_name": None, + "order": 1, + "people": [UUID(bytes=person.uuid.bytes)], + "type": "events", + }, + ] + ], + ) + + def _create_groups(self): + GroupTypeMapping.objects.create(team=self.team, group_type="organization", group_type_index=0) + GroupTypeMapping.objects.create(team=self.team, group_type="company", group_type_index=1) + + create_group( + team_id=self.team.pk, group_type_index=0, group_key="org:5", properties={"industry": "finance"} + ) + create_group( + team_id=self.team.pk, group_type_index=0, group_key="org:6", properties={"industry": "technology"} + ) + create_group(team_id=self.team.pk, group_type_index=1, group_key="org:5", properties={"industry": "random"}) + + @snapshot_clickhouse_queries + def test_funnel_breakdown_group(self): + self._create_groups() + + # event + person1 = _create_person(distinct_ids=["person1"], 
team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person1", + properties={"$group_0": "org:5", "$browser": "Chrome"}, + timestamp="2020-01-01T12:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person1", + properties={"$group_0": "org:5", "$browser": "Chrome"}, + timestamp="2020-01-01T13:00:00Z", + ) + _create_event( + team=self.team, + event="buy", + distinct_id="person1", + properties={"$group_0": "org:5", "$browser": "Chrome"}, + timestamp="2020-01-01T15:00:00Z", + ) + + person2 = _create_person(distinct_ids=["person2"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person2", + properties={"$group_0": "org:6", "$browser": "Safari"}, + timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person2", + properties={"$group_0": "org:6", "$browser": "Safari"}, + timestamp="2020-01-02T16:00:00Z", + ) + + person3 = _create_person(distinct_ids=["person3"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person3", + properties={"$group_0": "org:6", "$browser": "Safari"}, + timestamp="2020-01-02T14:00:00Z", + ) + + filters = { + "events": [{"id": "sign up", "order": 0}, {"id": "play movie", "order": 1}, {"id": "buy", "order": 2},], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-08", + "funnel_window_days": 7, + "breakdown": "industry", + "breakdown_type": "group", + "breakdown_group_type_index": 0, + } + + filter = Filter(data=filters, team=self.team) + result = Funnel(filter, self.team).run() + self.assertEqual( + result[0], + [ + { + "action_id": "sign up", + "name": "sign up", + "custom_name": None, + "order": 0, + "people": [person1.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "finance", + "breakdown_value": "finance", + }, + { + "action_id": "play movie", + "name": "play movie", + "custom_name": None, + "order": 1, + "people": [person1.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": 3600.0, + "median_conversion_time": 3600.0, + "breakdown": "finance", + "breakdown_value": "finance", + }, + { + "action_id": "buy", + "name": "buy", + "custom_name": None, + "order": 2, + "people": [person1.uuid] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": 7200.0, + "median_conversion_time": 7200.0, + "breakdown": "finance", + "breakdown_value": "finance", + }, + ], + ) + # Querying persons when aggregating by persons should be ok, despite group breakdown + self.assertCountEqual(self._get_people_at_step(filter, 1, "finance"), [person1.uuid]) + self.assertCountEqual(self._get_people_at_step(filter, 2, "finance"), [person1.uuid]) + self.assertEqual( + result[1], + [ + { + "action_id": "sign up", + "name": "sign up", + "custom_name": None, + "order": 0, + "people": [person2.uuid, person3.uuid] + if Funnel == ClickhouseFunnel + else [], # backwards compatibility + "count": 2, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "technology", + "breakdown_value": "technology", + }, + { + "action_id": "play movie", + "name": "play movie", + "custom_name": None, + "order": 1, + "people": [person2.uuid] if Funnel 
== ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": 7200.0, + "median_conversion_time": 7200.0, + "breakdown": "technology", + "breakdown_value": "technology", + }, + { + "action_id": "buy", + "name": "buy", + "custom_name": None, + "order": 2, + "people": [], + "count": 0, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "technology", + "breakdown_value": "technology", + }, + ], + ) + + self.assertCountEqual(self._get_people_at_step(filter, 1, "technology"), [person2.uuid, person3.uuid]) + self.assertCountEqual(self._get_people_at_step(filter, 2, "technology"), [person2.uuid]) + + @snapshot_clickhouse_queries + def test_funnel_aggregate_by_groups_breakdown_group(self): + self._create_groups() + # event + person1 = _create_person(distinct_ids=["person1"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person1", + properties={"$group_0": "org:5", "$browser": "Chrome"}, + timestamp="2020-01-01T12:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person1", + properties={"$group_0": "org:5", "$browser": "Chrome"}, + timestamp="2020-01-01T13:00:00Z", + ) + _create_event( + team=self.team, + event="buy", + distinct_id="person1", + properties={"$group_0": "org:5", "$browser": "Chrome"}, + timestamp="2020-01-01T15:00:00Z", + ) + + person2 = _create_person(distinct_ids=["person2"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person2", + properties={"$group_0": "org:6", "$browser": "Safari"}, + timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person2", + properties={"$group_0": "org:6", "$browser": "Safari"}, + timestamp="2020-01-02T16:00:00Z", + ) + + person3 = _create_person(distinct_ids=["person3"], team_id=self.team.pk) + _create_event( + team=self.team, + event="buy", + distinct_id="person3", + properties={"$group_0": "org:6", "$browser": "Safari"}, + timestamp="2020-01-02T18:00:00Z", + ) + + filters = { + "events": [{"id": "sign up", "order": 0}, {"id": "play movie", "order": 1}, {"id": "buy", "order": 2},], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-08", + "funnel_window_days": 7, + "breakdown": "industry", + "breakdown_type": "group", + "breakdown_group_type_index": 0, + "aggregation_group_type_index": 0, + } + + result = Funnel(Filter(data=filters, team=self.team), self.team).run() + + self.assertEqual( + result[0], + [ + { + "action_id": "sign up", + "name": "sign up", + "custom_name": None, + "order": 0, + "people": ["org:5"] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "finance", + "breakdown_value": "finance", + }, + { + "action_id": "play movie", + "name": "play movie", + "custom_name": None, + "order": 1, + "people": ["org:5"] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": 3600.0, + "median_conversion_time": 3600.0, + "breakdown": "finance", + "breakdown_value": "finance", + }, + { + "action_id": "buy", + "name": "buy", + "custom_name": None, + "order": 2, + "people": ["org:5"] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": 7200.0, + 
"median_conversion_time": 7200.0, + "breakdown": "finance", + "breakdown_value": "finance", + }, + ], + ) + + self.assertEqual( + result[1], + [ + { + "action_id": "sign up", + "name": "sign up", + "custom_name": None, + "order": 0, + "people": ["org:6"] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "technology", + "breakdown_value": "technology", + }, + { + "action_id": "play movie", + "name": "play movie", + "custom_name": None, + "order": 1, + "people": ["org:6"] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": 7200.0, + "median_conversion_time": 7200.0, + "breakdown": "technology", + "breakdown_value": "technology", + }, + { + "action_id": "buy", + "name": "buy", + "custom_name": None, + "order": 2, + "people": ["org:6"] if Funnel == ClickhouseFunnel else [], # backwards compatibility + "count": 1, + "type": "events", + "average_conversion_time": 7200, + "median_conversion_time": 7200, + "breakdown": "technology", + "breakdown_value": "technology", + }, + ], + ) + + return TestFunnelBreakdown diff --git a/ee/clickhouse/queries/funnels/test/conversion_time_cases.py b/ee/clickhouse/queries/funnels/test/conversion_time_cases.py new file mode 100644 index 0000000000000..f646ba257a33a --- /dev/null +++ b/ee/clickhouse/queries/funnels/test/conversion_time_cases.py @@ -0,0 +1,248 @@ +from posthog.constants import INSIGHT_FUNNELS +from posthog.models.filters import Filter +from posthog.test.base import APIBaseTest + + +def funnel_conversion_time_test_factory(Funnel, FunnelPerson, _create_event, _create_person): + class TestFunnelConversionTime(APIBaseTest): + def _get_people_at_step(self, filter, funnel_step): + person_filter = filter.with_data({"funnel_step": funnel_step}) + result = FunnelPerson(person_filter, self.team)._exec_query() + return [row[0] for row in result] + + def test_funnel_with_multiple_incomplete_tries(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "$pageview", "type": "events", "order": 1}, + {"id": "something else", "type": "events", "order": 2}, + ], + "funnel_window_days": 1, + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-14 00:00:00", + "insight": INSIGHT_FUNNELS, + } + + filter = Filter(data=filters) + funnel = Funnel(filter, self.team) + + # event + person1 = _create_person(distinct_ids=["person1"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id="person1", timestamp="2021-05-01 01:00:00" + ) + _create_event(team=self.team, event="$pageview", distinct_id="person1", timestamp="2021-05-01 02:00:00") + _create_event( + team=self.team, event="something else", distinct_id="person1", timestamp="2021-05-01 03:00:00" + ) + # person1 completed funnel on 2021-05-01 + + _create_event( + team=self.team, event="user signed up", distinct_id="person1", timestamp="2021-05-03 04:00:00" + ) + _create_event(team=self.team, event="$pageview", distinct_id="person1", timestamp="2021-05-03 06:00:00") + # person1 completed part of funnel on 2021-05-03 and took 2 hours to convert + + _create_event( + team=self.team, event="user signed up", distinct_id="person1", timestamp="2021-05-04 07:00:00" + ) + _create_event(team=self.team, event="$pageview", distinct_id="person1", timestamp="2021-05-04 10:00:00") + # person1 completed part of funnel on 2021-05-04 and took 3 hours 
to convert + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[1]["name"], "$pageview") + self.assertEqual(result[2]["name"], "something else") + self.assertEqual(result[0]["count"], 1) + self.assertEqual( + result[1]["average_conversion_time"], 3600 + ) # one hour to convert, disregard the incomplete tries + self.assertEqual(result[1]["median_conversion_time"], 3600) + + # check ordering of people in every step + self.assertCountEqual( + self._get_people_at_step(filter, 1), [person1.uuid,], + ) + + def test_funnel_step_conversion_times(self): + filters = { + "events": [{"id": "sign up", "order": 0}, {"id": "play movie", "order": 1}, {"id": "buy", "order": 2},], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-08", + "funnel_window_days": 7, + } + + filter = Filter(data=filters) + funnel = Funnel(filter, self.team) + + # event + person1 = _create_person(distinct_ids=["person1"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person1", + properties={"key": "val"}, + timestamp="2020-01-01T12:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person1", + properties={"key": "val"}, + timestamp="2020-01-01T13:00:00Z", + ) + _create_event( + team=self.team, + event="buy", + distinct_id="person1", + properties={"key": "val"}, + timestamp="2020-01-01T15:00:00Z", + ) + + person2 = _create_person(distinct_ids=["person2"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person2", + properties={"key": "val"}, + timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person2", + properties={"key": "val"}, + timestamp="2020-01-02T16:00:00Z", + ) + + person3 = _create_person(distinct_ids=["person3"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person3", + properties={"key": "val"}, + timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person3", + properties={"key": "val"}, + timestamp="2020-01-02T16:00:00Z", + ) + _create_event( + team=self.team, + event="buy", + distinct_id="person3", + properties={"key": "val"}, + timestamp="2020-01-02T17:00:00Z", + ) + + result = funnel.run() + + self.assertEqual(result[0]["average_conversion_time"], None) + self.assertEqual(result[1]["average_conversion_time"], 6000) + self.assertEqual(result[2]["average_conversion_time"], 5400) + + self.assertEqual(result[0]["median_conversion_time"], None) + self.assertEqual(result[1]["median_conversion_time"], 7200) + self.assertEqual(result[2]["median_conversion_time"], 5400) + + def test_funnel_times_with_different_conversion_windows(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "pageview", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "funnel_window_interval": 14, + "funnel_window_interval_unit": "day", + "date_from": "2020-01-01", + "date_to": "2020-01-14", + } + + filter = Filter(data=filters) + funnel = Funnel(filter, self.team) + + # event + person1_stopped_after_two_signups = _create_person( + distinct_ids=["stopped_after_signup1"], team_id=self.team.pk + ) + _create_event( + team=self.team, + event="user signed up", + distinct_id="stopped_after_signup1", + timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, event="pageview", distinct_id="stopped_after_signup1", 
timestamp="2020-01-02T14:05:00Z" + ) + + person2_stopped_after_signup = _create_person(distinct_ids=["stopped_after_signup2"], team_id=self.team.pk) + _create_event( + team=self.team, + event="user signed up", + distinct_id="stopped_after_signup2", + timestamp="2020-01-02T14:03:00Z", + ) + + person3_stopped_after_two_signups = _create_person( + distinct_ids=["stopped_after_signup3"], team_id=self.team.pk + ) + _create_event( + team=self.team, + event="user signed up", + distinct_id="stopped_after_signup3", + timestamp="2020-01-02T12:00:00Z", + ) + _create_event( + team=self.team, event="pageview", distinct_id="stopped_after_signup3", timestamp="2020-01-02T12:15:00Z" + ) + + result = funnel.run() + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 3) + self.assertEqual(result[1]["count"], 2) + self.assertEqual(result[1]["average_conversion_time"], 600) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), + [ + person1_stopped_after_two_signups.uuid, + person2_stopped_after_signup.uuid, + person3_stopped_after_two_signups.uuid, + ], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 2), + [person1_stopped_after_two_signups.uuid, person3_stopped_after_two_signups.uuid], + ) + + filter = filter.with_data({"funnel_window_interval": 5, "funnel_window_interval_unit": "minute"}) + + funnel = Funnel(filter, self.team) + result4 = funnel.run() + + self.assertNotEqual(result, result4) + self.assertEqual(result4[0]["name"], "user signed up") + self.assertEqual(result4[0]["count"], 3) + self.assertEqual(result4[1]["count"], 1) + self.assertEqual(result4[1]["average_conversion_time"], 300) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), + [ + person1_stopped_after_two_signups.uuid, + person2_stopped_after_signup.uuid, + person3_stopped_after_two_signups.uuid, + ], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 2), [person1_stopped_after_two_signups.uuid], + ) + + return TestFunnelConversionTime diff --git a/ee/clickhouse/queries/funnels/test/test_funnel.py b/ee/clickhouse/queries/funnels/test/test_funnel.py new file mode 100644 index 0000000000000..355833151397e --- /dev/null +++ b/ee/clickhouse/queries/funnels/test/test_funnel.py @@ -0,0 +1,1635 @@ +from unittest.case import skip +from uuid import uuid4 + +from freezegun.api import freeze_time +from rest_framework.exceptions import ValidationError + +from ee.clickhouse.materialized_columns import materialize +from ee.clickhouse.models.event import create_event +from ee.clickhouse.models.group import create_group +from ee.clickhouse.queries.funnels.funnel import ClickhouseFunnel +from ee.clickhouse.queries.funnels.funnel_persons import ClickhouseFunnelPersons +from ee.clickhouse.queries.funnels.test.breakdown_cases import funnel_breakdown_test_factory +from ee.clickhouse.queries.funnels.test.conversion_time_cases import funnel_conversion_time_test_factory +from ee.clickhouse.util import ClickhouseTestMixin, snapshot_clickhouse_queries +from posthog.constants import INSIGHT_FUNNELS +from posthog.models import Element +from posthog.models.action import Action +from posthog.models.action_step import ActionStep +from posthog.models.filters import Filter +from posthog.models.group_type_mapping import GroupTypeMapping +from posthog.models.person import Person +from posthog.queries.test.test_funnel import funnel_test_factory +from posthog.test.base import test_with_materialized_columns + +FORMAT_TIME = "%Y-%m-%d 00:00:00" +MAX_STEP_COLUMN = 0 
+COUNT_COLUMN = 1 +PERSON_ID_COLUMN = 2 + + +def _create_action(**kwargs): + team = kwargs.pop("team") + name = kwargs.pop("name") + properties = kwargs.pop("properties", {}) + action = Action.objects.create(team=team, name=name) + ActionStep.objects.create(action=action, event=name, properties=properties) + return action + + +def _create_person(**kwargs): + person = Person.objects.create(**kwargs) + return Person(id=person.uuid, uuid=person.uuid) + + +def _create_event(**kwargs): + kwargs.update({"event_uuid": uuid4()}) + create_event(**kwargs) + + +class TestFunnelBreakdown(ClickhouseTestMixin, funnel_breakdown_test_factory(ClickhouseFunnel, ClickhouseFunnelPersons, _create_event, _create_action, _create_person)): # type: ignore + maxDiff = None + pass + + +class TestFunnelConversionTime(ClickhouseTestMixin, funnel_conversion_time_test_factory(ClickhouseFunnel, ClickhouseFunnelPersons, _create_event, _create_person)): # type: ignore + maxDiff = None + pass + + +class TestClickhouseFunnel(ClickhouseTestMixin, funnel_test_factory(ClickhouseFunnel, _create_event, _create_person)): # type: ignore + + maxDiff = None + + def _get_people_at_step(self, filter, funnel_step, breakdown_value=None): + person_filter = filter.with_data({"funnel_step": funnel_step, "funnel_step_breakdown": breakdown_value}) + result = ClickhouseFunnelPersons(person_filter, self.team)._exec_query() + return [row[0] for row in result] + + def test_basic_funnel_default_funnel_days(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-14", + } + + filter = Filter(data=filters) + funnel = ClickhouseFunnel(filter, self.team) + + # event + _create_person(distinct_ids=["user_1"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id="user_1", timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, event="paid", distinct_id="user_1", timestamp="2020-01-10T14:00:00Z", + ) + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 1) + self.assertEqual(len(result[0]["people"]), 1) + + self.assertEqual(result[1]["name"], "paid") + self.assertEqual(result[1]["count"], 1) + self.assertEqual(len(result[1]["people"]), 1) + + def test_basic_funnel_with_repeat_steps(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "user signed up", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "funnel_window_days": 14, + } + + filter = Filter(data=filters) + funnel = ClickhouseFunnel(filter, self.team) + + # event + person1_stopped_after_two_signups = _create_person(distinct_ids=["stopped_after_signup1"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_signup1") + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_signup1") + + person2_stopped_after_signup = _create_person(distinct_ids=["stopped_after_signup2"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_signup2") + + result = funnel.run() + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 2) + self.assertEqual(len(result[0]["people"]), 2) + self.assertEqual(result[1]["count"], 1) + self.assertEqual(len(result[1]["people"]), 1) + + 
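# only the user who signed up twice advances to step 2 of the repeated-step funnel + 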
self.assertCountEqual( + self._get_people_at_step(filter, 1), + [person1_stopped_after_two_signups.uuid, person2_stopped_after_signup.uuid], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 2), [person1_stopped_after_two_signups.uuid], + ) + + @test_with_materialized_columns(["key"]) + def test_basic_funnel_with_derivative_steps(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0, "properties": {"key": "val"}}, + {"id": "user signed up", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "funnel_window_days": 14, + } + + filter = Filter(data=filters) + funnel = ClickhouseFunnel(filter, self.team) + + # event + person1_stopped_after_two_signups = _create_person(distinct_ids=["stopped_after_signup1"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id="stopped_after_signup1", properties={"key": "val"} + ) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_signup1") + + person2_stopped_after_signup = _create_person(distinct_ids=["stopped_after_signup2"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id="stopped_after_signup2", properties={"key": "val"} + ) + + result = funnel.run() + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 2) + self.assertEqual(len(result[0]["people"]), 2) + self.assertEqual(result[1]["count"], 1) + self.assertEqual(len(result[1]["people"]), 1) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), + [person1_stopped_after_two_signups.uuid, person2_stopped_after_signup.uuid], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 2), [person1_stopped_after_two_signups.uuid], + ) + + def test_basic_funnel_with_repeat_step_updated_param(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "user signed up", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "funnel_window_interval": 14, + "funnel_window_interval_unit": "day", + } + + filter = Filter(data=filters) + funnel = ClickhouseFunnel(filter, self.team) + + # event + person1_stopped_after_two_signups = _create_person(distinct_ids=["stopped_after_signup1"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_signup1") + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_signup1") + + person2_stopped_after_signup = _create_person(distinct_ids=["stopped_after_signup2"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_signup2") + + result = funnel.run() + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 2) + self.assertEqual(len(result[0]["people"]), 2) + self.assertEqual(result[1]["count"], 1) + self.assertEqual(len(result[1]["people"]), 1) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), + [person1_stopped_after_two_signups.uuid, person2_stopped_after_signup.uuid], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 2), [person1_stopped_after_two_signups.uuid], + ) + + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "user signed up", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "funnel_window_interval": 2, + "funnel_window_interval_unit": "week", + } + + filter = Filter(data=filters) + funnel = ClickhouseFunnel(filter, 
self.team) + result2 = funnel.run() + self.assertEqual(result, result2) + + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "user signed up", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "funnel_window_days": 14, + "funnel_window_interval": 1, + "funnel_window_interval_unit": "hour", + } + + filter = Filter(data=filters) + funnel = ClickhouseFunnel(filter, self.team) + result3 = funnel.run() + self.assertEqual(result, result3) + + def test_funnel_exclusions_full_window(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "funnel_window_days": 14, + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-14 00:00:00", + "exclusions": [ + {"id": "x 1 name with numbers 2", "type": "events", "funnel_from_step": 0, "funnel_to_step": 1}, + ], + } + filter = Filter(data=filters) + funnel = ClickhouseFunnel(filter, self.team) + + # event 1 + person1 = _create_person(distinct_ids=["person1"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="person1", timestamp="2021-05-01 01:00:00") + _create_event(team=self.team, event="paid", distinct_id="person1", timestamp="2021-05-01 02:00:00") + + # event 2 + person2 = _create_person(distinct_ids=["person2"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="person2", timestamp="2021-05-01 03:00:00") + _create_event( + team=self.team, event="x 1 name with numbers 2", distinct_id="person2", timestamp="2021-05-01 03:30:00" + ) + _create_event(team=self.team, event="paid", distinct_id="person2", timestamp="2021-05-01 04:00:00") + + # event 3 + person3 = _create_person(distinct_ids=["person3"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="person3", timestamp="2021-05-01 05:00:00") + _create_event(team=self.team, event="paid", distinct_id="person3", timestamp="2021-05-01 06:00:00") + + result = funnel.run() + self.assertEqual(len(result), 2) + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 2) + self.assertEqual(len(result[0]["people"]), 2) + self.assertEqual(result[1]["name"], "paid") + self.assertEqual(result[1]["count"], 2) + self.assertEqual(len(result[1]["people"]), 2) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), [person1.uuid, person3.uuid], + ) + self.assertCountEqual( + self._get_people_at_step(filter, 2), [person1.uuid, person3.uuid], + ) + + def test_advanced_funnel_exclusions_between_steps(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "$pageview", "type": "events", "order": 1}, + {"id": "insight viewed", "type": "events", "order": 2}, + {"id": "invite teammate", "type": "events", "order": 3}, + {"id": "pageview2", "type": "events", "order": 4}, + ], + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-14 00:00:00", + "insight": INSIGHT_FUNNELS, + "exclusions": [{"id": "x", "type": "events", "funnel_from_step": 0, "funnel_to_step": 1},], + } + + person1 = _create_person(distinct_ids=["person1"], team_id=self.team.pk) + # this dude is discarded when funnel_from_step = 1 + # this dude is discarded when funnel_from_step = 2 + # this dude is discarded when funnel_from_step = 3 + _create_event(team=self.team, event="user signed up", distinct_id="person1", timestamp="2021-05-01 01:00:00") + 
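# "x" fires between every later pair of steps for person1, but not between sign up and $pageview + 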
_create_event(team=self.team, event="$pageview", distinct_id="person1", timestamp="2021-05-01 02:00:00") + _create_event(team=self.team, event="x", distinct_id="person1", timestamp="2021-05-01 03:00:00") + _create_event(team=self.team, event="insight viewed", distinct_id="person1", timestamp="2021-05-01 04:00:00") + _create_event(team=self.team, event="x", distinct_id="person1", timestamp="2021-05-01 04:30:00") + _create_event(team=self.team, event="invite teammate", distinct_id="person1", timestamp="2021-05-01 05:00:00") + _create_event(team=self.team, event="x", distinct_id="person1", timestamp="2021-05-01 05:30:00") + _create_event(team=self.team, event="pageview2", distinct_id="person1", timestamp="2021-05-01 06:00:00") + + person2 = _create_person(distinct_ids=["person2"], team_id=self.team.pk) + # this dude is discarded when funnel_from_step = 2 + # this dude is discarded when funnel_from_step = 3 + _create_event(team=self.team, event="user signed up", distinct_id="person2", timestamp="2021-05-01 01:00:00") + _create_event(team=self.team, event="$pageview", distinct_id="person2", timestamp="2021-05-01 02:00:00") + _create_event(team=self.team, event="insight viewed", distinct_id="person2", timestamp="2021-05-01 04:00:00") + _create_event(team=self.team, event="x", distinct_id="person2", timestamp="2021-05-01 04:30:00") + _create_event(team=self.team, event="invite teammate", distinct_id="person2", timestamp="2021-05-01 05:00:00") + _create_event(team=self.team, event="x", distinct_id="person2", timestamp="2021-05-01 05:30:00") + _create_event(team=self.team, event="pageview2", distinct_id="person2", timestamp="2021-05-01 06:00:00") + + person3 = _create_person(distinct_ids=["person3"], team_id=self.team.pk) + # this dude is discarded when funnel_from_step = 0 + # this dude is discarded when funnel_from_step = 3 + _create_event(team=self.team, event="user signed up", distinct_id="person3", timestamp="2021-05-01 01:00:00") + _create_event(team=self.team, event="x", distinct_id="person3", timestamp="2021-05-01 01:30:00") + _create_event(team=self.team, event="$pageview", distinct_id="person3", timestamp="2021-05-01 02:00:00") + _create_event(team=self.team, event="insight viewed", distinct_id="person3", timestamp="2021-05-01 04:00:00") + _create_event(team=self.team, event="invite teammate", distinct_id="person3", timestamp="2021-05-01 05:00:00") + _create_event(team=self.team, event="x", distinct_id="person3", timestamp="2021-05-01 05:30:00") + _create_event(team=self.team, event="pageview2", distinct_id="person3", timestamp="2021-05-01 06:00:00") + + filter = Filter(data=filters) + funnel = ClickhouseFunnel(filter, self.team) + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 2) + self.assertEqual(len(result[0]["people"]), 2) + + self.assertEqual(result[4]["count"], 2) + self.assertEqual(len(result[4]["people"]), 2) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), [person1.uuid, person2.uuid,], + ) + + filter = filter.with_data( + {"exclusions": [{"id": "x", "type": "events", "funnel_from_step": 1, "funnel_to_step": 2}]} + ) + funnel = ClickhouseFunnel(filter, self.team) + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 2) + self.assertEqual(len(result[0]["people"]), 2) + + self.assertEqual(result[4]["count"], 2) + self.assertEqual(len(result[4]["people"]), 2) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), 
[person2.uuid, person3.uuid,], + ) + + filter = filter.with_data( + {"exclusions": [{"id": "x", "type": "events", "funnel_from_step": 2, "funnel_to_step": 3}]} + ) + funnel = ClickhouseFunnel(filter, self.team) + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 1) + self.assertEqual(len(result[0]["people"]), 1) + + self.assertEqual(result[4]["count"], 1) + self.assertEqual(len(result[4]["people"]), 1) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), [person3.uuid,], + ) + + filter = filter.with_data( + {"exclusions": [{"id": "x", "type": "events", "funnel_from_step": 3, "funnel_to_step": 4}]} + ) + funnel = ClickhouseFunnel(filter, self.team) + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 0) + self.assertEqual(len(result[0]["people"]), 0) + + self.assertEqual(result[4]["count"], 0) + self.assertEqual(len(result[4]["people"]), 0) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), [], + ) + + #  bigger step window + filter = filter.with_data( + {"exclusions": [{"id": "x", "type": "events", "funnel_from_step": 1, "funnel_to_step": 3}]} + ) + funnel = ClickhouseFunnel(filter, self.team) + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 1) + self.assertEqual(len(result[0]["people"]), 1) + + self.assertEqual(result[4]["count"], 1) + self.assertEqual(len(result[4]["people"]), 1) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), [person3.uuid], + ) + + def test_advanced_funnel_with_repeat_steps(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "$pageview", "type": "events", "order": 1}, + {"id": "$pageview", "type": "events", "order": 2}, + {"id": "$pageview", "type": "events", "order": 3}, + {"id": "$pageview", "type": "events", "order": 4}, + ], + "insight": INSIGHT_FUNNELS, + } + + filter = Filter(data=filters) + funnel = ClickhouseFunnel(filter, self.team) + + # event + person1_stopped_after_signup = _create_person(distinct_ids=["stopped_after_signup1"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_signup1") + + person2_stopped_after_one_pageview = _create_person( + distinct_ids=["stopped_after_pageview1"], team_id=self.team.pk + ) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_pageview1") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview1") + + person3_stopped_after_two_pageview = _create_person( + distinct_ids=["stopped_after_pageview2"], team_id=self.team.pk + ) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_pageview2") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview2") + _create_event(team=self.team, event="blaah blaa", distinct_id="stopped_after_pageview2") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview2") + + person4_stopped_after_three_pageview = _create_person( + distinct_ids=["stopped_after_pageview3"], team_id=self.team.pk + ) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_pageview3") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview3") + _create_event(team=self.team, event="blaah blaa", distinct_id="stopped_after_pageview3") + 
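# unrelated "blaah blaa" events interleaved here should not break the $pageview step sequence + 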
_create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview3") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview3") + _create_event(team=self.team, event="blaah blaa", distinct_id="stopped_after_pageview3") + + person5_stopped_after_many_pageview = _create_person( + distinct_ids=["stopped_after_pageview4"], team_id=self.team.pk + ) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_pageview4") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview4") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview4") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview4") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview4") + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[1]["name"], "$pageview") + self.assertEqual(result[4]["name"], "$pageview") + self.assertEqual(result[0]["count"], 5) + self.assertEqual(len(result[0]["people"]), 5) + self.assertEqual(result[1]["count"], 4) + self.assertEqual(len(result[1]["people"]), 4) + self.assertEqual(result[2]["count"], 3) + self.assertEqual(len(result[2]["people"]), 3) + self.assertEqual(result[3]["count"], 2) + self.assertEqual(len(result[3]["people"]), 2) + self.assertEqual(result[4]["count"], 1) + self.assertEqual(len(result[4]["people"]), 1) + # check ordering of people in every step + self.assertCountEqual( + self._get_people_at_step(filter, 1), + [ + person1_stopped_after_signup.uuid, + person2_stopped_after_one_pageview.uuid, + person3_stopped_after_two_pageview.uuid, + person4_stopped_after_three_pageview.uuid, + person5_stopped_after_many_pageview.uuid, + ], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 2), + [ + person2_stopped_after_one_pageview.uuid, + person3_stopped_after_two_pageview.uuid, + person4_stopped_after_three_pageview.uuid, + person5_stopped_after_many_pageview.uuid, + ], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 3), + [ + person3_stopped_after_two_pageview.uuid, + person4_stopped_after_three_pageview.uuid, + person5_stopped_after_many_pageview.uuid, + ], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 4), + [person4_stopped_after_three_pageview.uuid, person5_stopped_after_many_pageview.uuid], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 5), [person5_stopped_after_many_pageview.uuid], + ) + + def test_advanced_funnel_with_repeat_steps_out_of_order_events(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "$pageview", "type": "events", "order": 1}, + {"id": "$pageview", "type": "events", "order": 2}, + {"id": "$pageview", "type": "events", "order": 3}, + {"id": "$pageview", "type": "events", "order": 4}, + ], + "insight": INSIGHT_FUNNELS, + "funnel_window_days": 14, + } + + filter = Filter(data=filters) + funnel = ClickhouseFunnel(filter, self.team) + + # event + person1_stopped_after_signup = _create_person( + distinct_ids=["random", "stopped_after_signup1"], team_id=self.team.pk + ) + _create_event(team=self.team, event="$pageview", distinct_id="random") + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_signup1") + + person2_stopped_after_one_pageview = _create_person( + distinct_ids=["stopped_after_pageview1"], team_id=self.team.pk + ) + _create_event(team=self.team, event="user 
signed up", distinct_id="stopped_after_pageview1") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview1") + + person3_stopped_after_two_pageview = _create_person( + distinct_ids=["stopped_after_pageview2"], team_id=self.team.pk + ) + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview2") + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_pageview2") + _create_event(team=self.team, event="blaah blaa", distinct_id="stopped_after_pageview2") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview2") + + person4_stopped_after_three_pageview = _create_person( + distinct_ids=["stopped_after_pageview3"], team_id=self.team.pk + ) + _create_event(team=self.team, event="blaah blaa", distinct_id="stopped_after_pageview3") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview3") + _create_event(team=self.team, event="blaah blaa", distinct_id="stopped_after_pageview3") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview3") + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_pageview3") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview3") + + person5_stopped_after_many_pageview = _create_person( + distinct_ids=["stopped_after_pageview4"], team_id=self.team.pk + ) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_pageview4") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview4") + _create_event(team=self.team, event="blaah blaa", distinct_id="stopped_after_pageview4") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview4") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview4") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview4") + + person6_stopped_after_many_pageview_without_signup = _create_person( + distinct_ids=["stopped_after_pageview5"], team_id=self.team.pk + ) + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview5") + _create_event(team=self.team, event="blaah blaa", distinct_id="stopped_after_pageview5") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview5") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview5") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview5") + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[1]["name"], "$pageview") + self.assertEqual(result[4]["name"], "$pageview") + self.assertEqual(result[0]["count"], 5) + self.assertEqual(len(result[0]["people"]), 5) + self.assertEqual(result[1]["count"], 4) + self.assertEqual(len(result[1]["people"]), 4) + self.assertEqual(result[2]["count"], 1) + self.assertEqual(len(result[2]["people"]), 1) + self.assertEqual(result[3]["count"], 1) + self.assertEqual(len(result[3]["people"]), 1) + self.assertEqual(result[4]["count"], 1) + self.assertEqual(len(result[4]["people"]), 1) + # check ordering of people in every step + self.assertCountEqual( + self._get_people_at_step(filter, 1), + [ + person1_stopped_after_signup.uuid, + person2_stopped_after_one_pageview.uuid, + person3_stopped_after_two_pageview.uuid, + person4_stopped_after_three_pageview.uuid, + person5_stopped_after_many_pageview.uuid, + ], + ) + + 
self.assertCountEqual( + self._get_people_at_step(filter, 2), + [ + person2_stopped_after_one_pageview.uuid, + person3_stopped_after_two_pageview.uuid, + person4_stopped_after_three_pageview.uuid, + person5_stopped_after_many_pageview.uuid, + ], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 3), [person5_stopped_after_many_pageview.uuid], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 4), [person5_stopped_after_many_pageview.uuid], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 5), [person5_stopped_after_many_pageview.uuid], + ) + + @test_with_materialized_columns(["key"]) + def test_funnel_with_actions(self): + + sign_up_action = _create_action( + name="sign up", + team=self.team, + properties=[{"key": "key", "type": "event", "value": ["val"], "operator": "exact"}], + ) + + filters = { + "actions": [ + {"id": sign_up_action.id, "math": "dau", "order": 0}, + {"id": sign_up_action.id, "math": "wau", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + } + + filter = Filter(data=filters) + funnel = ClickhouseFunnel(filter, self.team) + + # event + person1_stopped_after_two_signups = _create_person(distinct_ids=["stopped_after_signup1"], team_id=self.team.pk) + _create_event(team=self.team, event="sign up", distinct_id="stopped_after_signup1", properties={"key": "val"}) + _create_event(team=self.team, event="sign up", distinct_id="stopped_after_signup1", properties={"key": "val"}) + + person2_stopped_after_signup = _create_person(distinct_ids=["stopped_after_signup2"], team_id=self.team.pk) + _create_event(team=self.team, event="sign up", distinct_id="stopped_after_signup2", properties={"key": "val"}) + + result = funnel.run() + + self.assertEqual(result[0]["name"], "sign up") + self.assertEqual(result[0]["count"], 2) + self.assertEqual(len(result[0]["people"]), 2) + self.assertEqual(result[1]["count"], 1) + self.assertEqual(len(result[1]["people"]), 1) + # check ordering of people in first step + self.assertCountEqual( + self._get_people_at_step(filter, 1), + [person1_stopped_after_two_signups.uuid, person2_stopped_after_signup.uuid], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 2), [person1_stopped_after_two_signups.uuid], + ) + + @test_with_materialized_columns(["key"]) + @skip("Flaky funnel test") + def test_funnel_with_actions_and_events(self): + + sign_up_action = _create_action( + name="sign up", + team=self.team, + properties=[{"key": "key", "type": "event", "value": ["val"], "operator": "exact"}], + ) + + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "user signed up", "type": "events", "order": 1}, + ], + "actions": [ + {"id": sign_up_action.id, "math": "dau", "order": 2}, + {"id": sign_up_action.id, "math": "wau", "order": 3}, + ], + "insight": INSIGHT_FUNNELS, + "funnel_window_days": 14, + } + + filter = Filter(data=filters, team=self.team) + + # event + person1_stopped_after_two_signups = _create_person(distinct_ids=["stopped_after_signup1"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id="stopped_after_signup1", timestamp="2021-05-01 00:00:00" + ) + _create_event( + team=self.team, event="user signed up", distinct_id="stopped_after_signup1", timestamp="2021-05-01 00:00:01" + ) + _create_event( + team=self.team, + event="sign up", + distinct_id="stopped_after_signup1", + properties={"key": "val"}, + timestamp="2021-05-01 00:00:02", + ) + _create_event( + team=self.team, + event="sign up", + 
distinct_id="stopped_after_signup1", + properties={"key": "val"}, + timestamp="2021-05-01 00:00:03", + ) + + person2_stopped_after_signup = _create_person(distinct_ids=["stopped_after_signup2"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id="stopped_after_signup2", timestamp="2021-05-01 00:00:04" + ) + _create_event( + team=self.team, event="user signed up", distinct_id="stopped_after_signup2", timestamp="2021-05-01 00:00:05" + ) + _create_event( + team=self.team, + event="sign up", + distinct_id="stopped_after_signup2", + properties={"key": "val"}, + timestamp="2021-05-01 00:00:06", + ) + + person3 = _create_person(distinct_ids=["person3"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="person3", timestamp="2021-05-01 00:00:07") + _create_event( + team=self.team, + event="sign up", + distinct_id="person3", + properties={"key": "val"}, + timestamp="2021-05-01 00:00:08", + ) + _create_event(team=self.team, event="user signed up", distinct_id="person3", timestamp="2021-05-01 00:00:09") + _create_event( + team=self.team, + event="sign up", + distinct_id="person3", + properties={"key": "val"}, + timestamp="2021-05-01 00:00:10", + ) + + person4 = _create_person(distinct_ids=["person4"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="person4", timestamp="2021-05-01 00:00:11") + _create_event( + team=self.team, + event="sign up", + distinct_id="person4", + properties={"key": "val"}, + timestamp="2021-05-01 00:00:12", + ) + _create_event(team=self.team, event="user signed up", distinct_id="person4", timestamp="2021-05-01 00:00:13") + + person5 = _create_person(distinct_ids=["person5"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person5", + properties={"key": "val"}, + timestamp="2021-05-01 00:00:14", + ) + + with freeze_time("2021-05-02"): + result = ClickhouseFunnel(filter, self.team).run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 4) + self.assertEqual(result[1]["count"], 4) + self.assertEqual(result[2]["count"], 3) + self.assertEqual(result[3]["count"], 1) + + # check ordering of people in steps + self.assertCountEqual( + self._get_people_at_step(filter, 1), + [person1_stopped_after_two_signups.uuid, person2_stopped_after_signup.uuid, person3.uuid, person4.uuid], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 2), + [person1_stopped_after_two_signups.uuid, person2_stopped_after_signup.uuid, person3.uuid, person4.uuid], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 3), + [person1_stopped_after_two_signups.uuid, person2_stopped_after_signup.uuid, person3.uuid,], + ) + + self.assertCountEqual(self._get_people_at_step(filter, 4), [person1_stopped_after_two_signups.uuid,]) + + @test_with_materialized_columns(["$current_url"]) + def test_funnel_with_matching_properties(self): + filters = { + "events": [ + {"id": "user signed up", "order": 0}, + {"id": "$pageview", "order": 1, "properties": {"$current_url": "aloha.com"}}, + { + "id": "$pageview", + "order": 2, + "properties": {"$current_url": "aloha2.com"}, + }, # different event to above + {"id": "$pageview", "order": 3, "properties": {"$current_url": "aloha2.com"}}, + {"id": "$pageview", "order": 4,}, + ], + "insight": INSIGHT_FUNNELS, + "funnel_window_days": 14, + } + + filter = Filter(data=filters) + funnel = ClickhouseFunnel(filter, self.team) + + # event + 
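# no one fires another $pageview after matching both aloha2.com steps, so the final step ends up empty + 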
person1_stopped_after_signup = _create_person(distinct_ids=["stopped_after_signup1"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_signup1") + + person2_stopped_after_one_pageview = _create_person( + distinct_ids=["stopped_after_pageview1"], team_id=self.team.pk + ) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_pageview1") + _create_event( + team=self.team, + event="$pageview", + distinct_id="stopped_after_pageview1", + properties={"$current_url": "aloha.com"}, + ) + + person3_stopped_after_two_pageview = _create_person( + distinct_ids=["stopped_after_pageview2"], team_id=self.team.pk + ) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_pageview2") + _create_event( + team=self.team, + event="$pageview", + distinct_id="stopped_after_pageview2", + properties={"$current_url": "aloha.com"}, + ) + _create_event( + team=self.team, + event="blaah blaa", + distinct_id="stopped_after_pageview2", + properties={"$current_url": "aloha.com"}, + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="stopped_after_pageview2", + properties={"$current_url": "aloha2.com"}, + ) + + person4_stopped_after_three_pageview = _create_person( + distinct_ids=["stopped_after_pageview3"], team_id=self.team.pk + ) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_pageview3") + _create_event( + team=self.team, + event="$pageview", + distinct_id="stopped_after_pageview3", + properties={"$current_url": "aloha.com"}, + ) + _create_event(team=self.team, event="blaah blaa", distinct_id="stopped_after_pageview3") + _create_event( + team=self.team, + event="$pageview", + distinct_id="stopped_after_pageview3", + properties={"$current_url": "aloha2.com"}, + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="stopped_after_pageview3", + properties={"$current_url": "aloha2.com"}, + ) + _create_event(team=self.team, event="blaah blaa", distinct_id="stopped_after_pageview3") + + person5_stopped_after_many_pageview = _create_person( + distinct_ids=["stopped_after_pageview4"], team_id=self.team.pk + ) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_pageview4") + _create_event( + team=self.team, + event="$pageview", + distinct_id="stopped_after_pageview4", + properties={"$current_url": "aloha.com"}, + ) + _create_event(team=self.team, event="blaah blaa", distinct_id="stopped_after_pageview4") + _create_event( + team=self.team, + event="$pageview", + distinct_id="stopped_after_pageview4", + properties={"$current_url": "aloha2.com"}, + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="stopped_after_pageview4", + properties={"$current_url": "aloha.com"}, + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="stopped_after_pageview4", + properties={"$current_url": "aloha2.com"}, + ) + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[1]["name"], "$pageview") + self.assertEqual(result[4]["name"], "$pageview") + self.assertEqual(result[0]["count"], 5) + self.assertEqual(result[1]["count"], 4) + self.assertEqual(result[2]["count"], 3) + self.assertEqual(result[3]["count"], 2) + self.assertEqual(result[4]["count"], 0) + # check ordering of people in every step + self.assertCountEqual( + self._get_people_at_step(filter, 1), + [ + person1_stopped_after_signup.uuid, + person2_stopped_after_one_pageview.uuid, + 
person3_stopped_after_two_pageview.uuid, + person4_stopped_after_three_pageview.uuid, + person5_stopped_after_many_pageview.uuid, + ], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 2), + [ + person2_stopped_after_one_pageview.uuid, + person3_stopped_after_two_pageview.uuid, + person4_stopped_after_three_pageview.uuid, + person5_stopped_after_many_pageview.uuid, + ], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 3), + [ + person3_stopped_after_two_pageview.uuid, + person4_stopped_after_three_pageview.uuid, + person5_stopped_after_many_pageview.uuid, + ], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 4), + [person4_stopped_after_three_pageview.uuid, person5_stopped_after_many_pageview.uuid], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 5), [], + ) + + def test_funnel_conversion_window(self): + ids_to_compare = [] + for i in range(10): + person = _create_person(distinct_ids=[f"user_{i}"], team=self.team) + ids_to_compare.append(str(person.uuid)) + _create_event(event="step one", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-01 00:00:00") + _create_event(event="step two", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-02 00:00:00") + + for i in range(10, 25): + _create_person(distinct_ids=[f"user_{i}"], team=self.team) + _create_event(event="step one", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-01 00:00:00") + _create_event(event="step two", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-10 00:00:00") + + data = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-14 00:00:00", + "funnel_window_days": 7, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + + filter = Filter(data={**data}) + results = ClickhouseFunnel(filter, self.team).run() + + self.assertEqual(results[0]["count"], 25) + self.assertEqual(results[1]["count"], 10) + self.assertEqual(results[2]["count"], 0) + + self.assertCountEqual([str(id) for id in self._get_people_at_step(filter, 2)], ids_to_compare) + + def test_funnel_exclusions_invalid_params(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "funnel_window_days": 14, + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-14 00:00:00", + "exclusions": [{"id": "x", "type": "events", "funnel_from_step": 1, "funnel_to_step": 1},], + } + filter = Filter(data=filters) + self.assertRaises(ValidationError, lambda: ClickhouseFunnel(filter, self.team)) + + filter = filter.with_data( + {"exclusions": [{"id": "x", "type": "events", "funnel_from_step": 1, "funnel_to_step": 2}]} + ) + self.assertRaises(ValidationError, lambda: ClickhouseFunnel(filter, self.team)) + + filter = filter.with_data( + {"exclusions": [{"id": "x", "type": "events", "funnel_from_step": 2, "funnel_to_step": 1}]} + ) + self.assertRaises(ValidationError, lambda: ClickhouseFunnel(filter, self.team)) + + filter = filter.with_data( + {"exclusions": [{"id": "x", "type": "events", "funnel_from_step": 0, "funnel_to_step": 2}]} + ) + self.assertRaises(ValidationError, lambda: ClickhouseFunnel(filter, self.team)) + + def test_funnel_exclusion_no_end_event(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": 
INSIGHT_FUNNELS, + "funnel_window_days": 1, + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-14 00:00:00", + "exclusions": [{"id": "x", "type": "events", "funnel_from_step": 0, "funnel_to_step": 1},], + } + filter = Filter(data=filters) + funnel = ClickhouseFunnel(filter, self.team) + + # event 1 + person1 = _create_person(distinct_ids=["person1"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="person1", timestamp="2021-05-01 01:00:00") + _create_event(team=self.team, event="paid", distinct_id="person1", timestamp="2021-05-01 02:00:00") + + # event 2 + person2 = _create_person(distinct_ids=["person2"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="person2", timestamp="2021-05-01 03:00:00") + _create_event(team=self.team, event="x", distinct_id="person2", timestamp="2021-05-01 03:30:00") + _create_event(team=self.team, event="paid", distinct_id="person2", timestamp="2021-05-01 04:00:00") + + # event 3 + person3 = _create_person(distinct_ids=["person3"], team_id=self.team.pk) + # should be discarded, even if nothing happened after x, since within conversion window + _create_event(team=self.team, event="user signed up", distinct_id="person3", timestamp="2021-05-01 05:00:00") + _create_event(team=self.team, event="x", distinct_id="person3", timestamp="2021-05-01 06:00:00") + + # event 4 - outside conversion window + person4 = _create_person(distinct_ids=["person4"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="person4", timestamp="2021-05-01 07:00:00") + _create_event(team=self.team, event="x", distinct_id="person4", timestamp="2021-05-02 08:00:00") + + result = funnel.run() + self.assertEqual(len(result), 2) + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 2) + self.assertEqual(len(result[0]["people"]), 2) + self.assertEqual(result[1]["name"], "paid") + self.assertEqual(result[1]["count"], 1) + self.assertEqual(len(result[1]["people"]), 1) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), [person1.uuid, person4.uuid], + ) + self.assertCountEqual( + self._get_people_at_step(filter, 2), [person1.uuid], + ) + + @test_with_materialized_columns(["key"]) + def test_funnel_exclusions_with_actions(self): + + sign_up_action = _create_action( + name="sign up", + team=self.team, + properties=[{"key": "key", "type": "event", "value": ["val"], "operator": "exact"}], + ) + + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "funnel_window_days": 14, + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-14 00:00:00", + "exclusions": [{"id": sign_up_action.id, "type": "actions", "funnel_from_step": 0, "funnel_to_step": 1},], + } + filter = Filter(data=filters) + funnel = ClickhouseFunnel(filter, self.team) + + # event 1 + person1 = _create_person(distinct_ids=["person1"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="person1", timestamp="2021-05-01 01:00:00") + _create_event(team=self.team, event="paid", distinct_id="person1", timestamp="2021-05-01 02:00:00") + + # event 2 + person2 = _create_person(distinct_ids=["person2"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="person2", timestamp="2021-05-01 03:00:00") + _create_event( + team=self.team, + event="sign up", + distinct_id="person2", + 
properties={"key": "val"}, + timestamp="2021-05-01 03:30:00", + ) + _create_event(team=self.team, event="paid", distinct_id="person2", timestamp="2021-05-01 04:00:00") + + # event 3 + person3 = _create_person(distinct_ids=["person3"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="person3", timestamp="2021-05-01 05:00:00") + _create_event(team=self.team, event="paid", distinct_id="person3", timestamp="2021-05-01 06:00:00") + + result = funnel.run() + self.assertEqual(len(result), 2) + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 2) + self.assertEqual(len(result[0]["people"]), 2) + self.assertEqual(result[1]["name"], "paid") + self.assertEqual(result[1]["count"], 2) + self.assertEqual(len(result[1]["people"]), 2) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), [person1.uuid, person3.uuid], + ) + self.assertCountEqual( + self._get_people_at_step(filter, 2), [person1.uuid, person3.uuid], + ) + + def test_funnel_with_denormalised_properties(self): + filters = { + "events": [ + { + "id": "user signed up", + "type": "events", + "order": 0, + "properties": [{"key": "test_prop", "value": "hi"}], + }, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "properties": [{"key": "test_prop", "value": "hi"}], + "date_to": "2020-01-14", + } + + materialize("events", "test_prop") + + filter = Filter(data=filters) + funnel = ClickhouseFunnel(filter, self.team) + + # event + _create_person(distinct_ids=["user_1"], team_id=self.team.pk) + _create_event( + team=self.team, + event="user signed up", + distinct_id="user_1", + timestamp="2020-01-02T14:00:00Z", + properties={"test_prop": "hi"}, + ) + _create_event( + team=self.team, event="paid", distinct_id="user_1", timestamp="2020-01-10T14:00:00Z", + ) + + self.assertNotIn("json", funnel.get_query().lower()) + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 1) + + def test_advanced_funnel_multiple_exclusions_between_steps(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "$pageview", "type": "events", "order": 1}, + {"id": "insight viewed", "type": "events", "order": 2}, + {"id": "invite teammate", "type": "events", "order": 3}, + {"id": "pageview2", "type": "events", "order": 4}, + ], + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-14 00:00:00", + "insight": INSIGHT_FUNNELS, + "exclusions": [ + {"id": "x", "type": "events", "funnel_from_step": 0, "funnel_to_step": 1}, + {"id": "y", "type": "events", "funnel_from_step": 2, "funnel_to_step": 3}, + ], + } + + person1 = _create_person(distinct_ids=["person1"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="person1", timestamp="2021-05-01 01:00:00") + _create_event(team=self.team, event="x", distinct_id="person1", timestamp="2021-05-01 02:00:00") + _create_event(team=self.team, event="$pageview", distinct_id="person1", timestamp="2021-05-01 03:00:00") + _create_event(team=self.team, event="insight viewed", distinct_id="person1", timestamp="2021-05-01 04:00:00") + _create_event(team=self.team, event="y", distinct_id="person1", timestamp="2021-05-01 04:30:00") + _create_event(team=self.team, event="invite teammate", distinct_id="person1", timestamp="2021-05-01 05:00:00") + _create_event(team=self.team, event="pageview2", distinct_id="person1", timestamp="2021-05-01 
06:00:00") + + person2 = _create_person(distinct_ids=["person2"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="person2", timestamp="2021-05-01 01:00:00") + _create_event(team=self.team, event="y", distinct_id="person2", timestamp="2021-05-01 01:30:00") + _create_event(team=self.team, event="$pageview", distinct_id="person2", timestamp="2021-05-01 02:00:00") + _create_event(team=self.team, event="insight viewed", distinct_id="person2", timestamp="2021-05-01 04:00:00") + _create_event(team=self.team, event="y", distinct_id="person2", timestamp="2021-05-01 04:30:00") + _create_event(team=self.team, event="invite teammate", distinct_id="person2", timestamp="2021-05-01 05:00:00") + _create_event(team=self.team, event="x", distinct_id="person2", timestamp="2021-05-01 05:30:00") + _create_event(team=self.team, event="pageview2", distinct_id="person2", timestamp="2021-05-01 06:00:00") + + person3 = _create_person(distinct_ids=["person3"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="person3", timestamp="2021-05-01 01:00:00") + _create_event(team=self.team, event="x", distinct_id="person3", timestamp="2021-05-01 01:30:00") + _create_event(team=self.team, event="$pageview", distinct_id="person3", timestamp="2021-05-01 02:00:00") + _create_event(team=self.team, event="insight viewed", distinct_id="person3", timestamp="2021-05-01 04:00:00") + _create_event(team=self.team, event="invite teammate", distinct_id="person3", timestamp="2021-05-01 05:00:00") + _create_event(team=self.team, event="x", distinct_id="person3", timestamp="2021-05-01 05:30:00") + _create_event(team=self.team, event="pageview2", distinct_id="person3", timestamp="2021-05-01 06:00:00") + + person4 = _create_person(distinct_ids=["person4"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="person4", timestamp="2021-05-01 01:00:00") + _create_event(team=self.team, event="$pageview", distinct_id="person4", timestamp="2021-05-01 02:00:00") + _create_event(team=self.team, event="insight viewed", distinct_id="person4", timestamp="2021-05-01 04:00:00") + _create_event(team=self.team, event="invite teammate", distinct_id="person4", timestamp="2021-05-01 05:00:00") + _create_event(team=self.team, event="pageview2", distinct_id="person4", timestamp="2021-05-01 06:00:00") + + filter = Filter(data=filters) + funnel = ClickhouseFunnel(filter, self.team) + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 1) + self.assertEqual(len(result[0]["people"]), 1) + + self.assertEqual(result[4]["count"], 1) + self.assertEqual(len(result[4]["people"]), 1) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), [person4.uuid], + ) + + filter = filter.with_data( + { + "exclusions": [ + {"id": "x", "type": "events", "funnel_from_step": 0, "funnel_to_step": 1}, + {"id": "y", "type": "events", "funnel_from_step": 0, "funnel_to_step": 1}, + ], + } + ) + funnel = ClickhouseFunnel(filter, self.team) + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 1) + self.assertEqual(len(result[0]["people"]), 1) + + self.assertEqual(result[4]["count"], 1) + self.assertEqual(len(result[4]["people"]), 1) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), [person4.uuid], + ) + + filter = filter.with_data( + { + "exclusions": [ + {"id": "x", "type": "events", "funnel_from_step": 0, 
"funnel_to_step": 1}, + {"id": "y", "type": "events", "funnel_from_step": 0, "funnel_to_step": 1}, + ], + } + ) + funnel = ClickhouseFunnel(filter, self.team) + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 1) + self.assertEqual(len(result[0]["people"]), 1) + + self.assertEqual(result[4]["count"], 1) + self.assertEqual(len(result[4]["people"]), 1) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), [person4.uuid], + ) + + filter = filter.with_data( + { + "exclusions": [ + {"id": "x", "type": "events", "funnel_from_step": 0, "funnel_to_step": 4}, + {"id": "y", "type": "events", "funnel_from_step": 0, "funnel_to_step": 4}, + ], + } + ) + funnel = ClickhouseFunnel(filter, self.team) + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 1) + self.assertEqual(len(result[0]["people"]), 1) + + self.assertEqual(result[4]["count"], 1) + self.assertEqual(len(result[4]["people"]), 1) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), [person4.uuid], + ) + + def test_funnel_with_elements_chain(self): + person1 = _create_person(distinct_ids=["test"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="test") + _create_event( + team=self.team, + event="$autocapture", + distinct_id="test", + properties={"$current_url": "http://example.com/something_else"}, + elements=[Element(tag_name="img"), Element(tag_name="svg")], + ) + + person2 = _create_person(distinct_ids=["test2"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="test2") + + for tag_name in ["img", "svg"]: + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0,}, + { + "id": "$autocapture", + "name": "$autocapture", + "order": 1, + "properties": [ + {"key": "tag_name", "value": [tag_name], "operator": "exact", "type": "element"} + ], + "type": "events", + }, + ], + "insight": INSIGHT_FUNNELS, + } + + filter = Filter(data=filters) + result = ClickhouseFunnel(filter, self.team).run() + + self.assertEqual(len(result), 2) + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 2) + self.assertEqual(len(result[0]["people"]), 2) + self.assertEqual(result[1]["name"], "$autocapture") + self.assertEqual(result[1]["count"], 1) + self.assertEqual(len(result[1]["people"]), 1) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), [person1.uuid, person2.uuid], + ) + self.assertCountEqual( + self._get_people_at_step(filter, 2), [person1.uuid], + ) + + def _create_groups(self): + GroupTypeMapping.objects.create(team=self.team, group_type="organization", group_type_index=0) + GroupTypeMapping.objects.create(team=self.team, group_type="company", group_type_index=1) + + create_group(team_id=self.team.pk, group_type_index=0, group_key="org:5", properties={"industry": "finance"}) + create_group(team_id=self.team.pk, group_type_index=0, group_key="org:6", properties={"industry": "technology"}) + + create_group(team_id=self.team.pk, group_type_index=1, group_key="company:1", properties={}) + create_group(team_id=self.team.pk, group_type_index=1, group_key="company:2", properties={}) + + @snapshot_clickhouse_queries + def test_funnel_aggregation_with_groups(self): + self._create_groups() + + _create_person(distinct_ids=["user_1"], team_id=self.team.pk) + _create_event( + team=self.team, + event="user signed up", + distinct_id="user_1", + 
timestamp="2020-01-02T14:00:00Z", + properties={"$group_0": "org:5"}, + ) + + # different person, same group, so should count as step two in funnel + _create_person(distinct_ids=["user_2"], team_id=self.team.pk) + _create_event( + team=self.team, + event="paid", + distinct_id="user_2", + timestamp="2020-01-03T14:00:00Z", + properties={"$group_0": "org:5"}, + ) + + # same person, different group, so should count as different step 1 in funnel + _create_event( + team=self.team, + event="user signed up", + distinct_id="user_1", + timestamp="2020-01-10T14:00:00Z", + properties={"$group_0": "org:6"}, + ) + + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "aggregation_group_type_index": 0, + "date_from": "2020-01-01", + "date_to": "2020-01-14", + } + result = ClickhouseFunnel(Filter(data=filters, team=self.team), self.team).run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 2) + self.assertEqual(len(result[0]["people"]), 2) + + self.assertEqual(result[1]["name"], "paid") + self.assertEqual(result[1]["count"], 1) + self.assertEqual(len(result[1]["people"]), 1) + self.assertAlmostEqual(result[1]["average_conversion_time"], 86400) + + @snapshot_clickhouse_queries + def test_funnel_group_aggregation_with_groups_entity_filtering(self): + self._create_groups() + + _create_person(distinct_ids=["user_1"], team_id=self.team.pk) + _create_event( + team=self.team, + event="user signed up", + distinct_id="user_1", + timestamp="2020-01-02T14:00:00Z", + properties={"$group_0": "org:5"}, + ) + + # different person, same group, so should count as step two in funnel + _create_person(distinct_ids=["user_2"], team_id=self.team.pk) + _create_event( + team=self.team, + event="paid", + distinct_id="user_2", + timestamp="2020-01-03T14:00:00Z", + properties={"$group_0": "org:5"}, + ) + + # different person, different group, so should be discarded from step 1 in funnel + _create_person(distinct_ids=["user_3"], team_id=self.team.pk) + _create_event( + team=self.team, + event="user signed up", + distinct_id="user_3", + timestamp="2020-01-10T14:00:00Z", + properties={"$group_0": "org:6"}, + ) + + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0, "properties": {"$group_0": "org:5"}}, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "aggregation_group_type_index": 0, + "date_from": "2020-01-01", + "date_to": "2020-01-14", + } + result = ClickhouseFunnel(Filter(data=filters, team=self.team), self.team).run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 1) + self.assertEqual(len(result[0]["people"]), 1) + + self.assertEqual(result[1]["name"], "paid") + self.assertEqual(result[1]["count"], 1) + self.assertEqual(len(result[1]["people"]), 1) + self.assertAlmostEqual(result[1]["average_conversion_time"], 86400) + + @snapshot_clickhouse_queries + def test_funnel_with_groups_entity_filtering(self): + self._create_groups() + + _create_person(distinct_ids=["user_1"], team_id=self.team.pk) + _create_event( + team=self.team, + event="user signed up", + distinct_id="user_1", + timestamp="2020-01-02T14:00:00Z", + properties={"$group_0": "org:5"}, + ) + _create_event( + team=self.team, + event="paid", + distinct_id="user_1", + timestamp="2020-01-03T14:00:00Z", + properties={"$group_0": "org:6"}, # different group, but doesn't matter since not 
aggregating by groups + ) + + # event belongs to different group, so shouldn't enter funnel + _create_person(distinct_ids=["user_2"], team_id=self.team.pk) + _create_event( + team=self.team, + event="user signed up", + distinct_id="user_1", + timestamp="2020-01-02T14:00:00Z", + properties={"$group_0": "org:6"}, + ) + _create_event( + team=self.team, + event="paid", + distinct_id="user_1", + timestamp="2020-01-03T14:00:00Z", + properties={"$group_0": "org:6"}, + ) + + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0, "properties": {"$group_0": "org:5"}}, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-14", + } + result = ClickhouseFunnel(Filter(data=filters, team=self.team), self.team).run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 1) + self.assertEqual(len(result[0]["people"]), 1) + + self.assertEqual(result[1]["name"], "paid") + self.assertEqual(result[1]["count"], 1) + self.assertEqual(len(result[1]["people"]), 1) + + @snapshot_clickhouse_queries + def test_funnel_with_groups_global_filtering(self): + self._create_groups() + + person1 = _create_person(distinct_ids=["user_1"], team_id=self.team.pk) + _create_event( + team=self.team, + event="user signed up", + distinct_id="user_1", + timestamp="2020-01-02T14:00:00Z", + properties={"$group_0": "org:5"}, + ) + # second event belongs to different group, so shouldn't complete funnel + _create_event( + team=self.team, + event="paid", + distinct_id="user_1", + timestamp="2020-01-03T14:00:00Z", + properties={"$group_0": "org:6"}, + ) + + # event belongs to different group, so shouldn't enter funnel + _create_person(distinct_ids=["user_2"], team_id=self.team.pk) + _create_event( + team=self.team, + event="user signed up", + distinct_id="user_2", + timestamp="2020-01-02T14:00:00Z", + properties={"$group_0": "org:6"}, + ) + _create_event( + team=self.team, + event="paid", + distinct_id="user_2", + timestamp="2020-01-03T14:00:00Z", + properties={"$group_0": "org:5"}, # same group, but different person, so not in funnel + ) + + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-14", + "properties": [{"key": "industry", "value": "finance", "type": "group", "group_type_index": 0}], + } + result = ClickhouseFunnel(Filter(data=filters, team=self.team), self.team).run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 1) + self.assertCountEqual(result[0]["people"], [person1.uuid]) + + self.assertEqual(result[1]["name"], "paid") + self.assertEqual(result[1]["count"], 0) + self.assertEqual(len(result[1]["people"]), 0) diff --git a/ee/clickhouse/queries/funnels/test/test_funnel_correlation.py b/ee/clickhouse/queries/funnels/test/test_funnel_correlation.py new file mode 100644 index 0000000000000..f5cfa76c4985b --- /dev/null +++ b/ee/clickhouse/queries/funnels/test/test_funnel_correlation.py @@ -0,0 +1,996 @@ +import unittest +from uuid import uuid4 + +from rest_framework.exceptions import ValidationError + +from ee.clickhouse.models.event import create_event +from ee.clickhouse.queries.funnels.funnel_correlation import EventContingencyTable, EventStats, FunnelCorrelation +from ee.clickhouse.queries.funnels.funnel_correlation_persons import FunnelCorrelationPersons +from 
ee.clickhouse.util import ClickhouseTestMixin +from posthog.constants import INSIGHT_FUNNELS +from posthog.models.element import Element +from posthog.models.filters import Filter +from posthog.models.person import Person +from posthog.test.base import APIBaseTest, test_with_materialized_columns + + +def _create_person(**kwargs): + person = Person.objects.create(**kwargs) + return Person(id=person.uuid, uuid=person.uuid) + + +def _create_event(**kwargs): + kwargs.update({"event_uuid": uuid4()}) + create_event(**kwargs) + + +class TestClickhouseFunnelCorrelation(ClickhouseTestMixin, APIBaseTest): + + maxDiff = None + + def _get_people_for_event(self, filter: Filter, event_name: str, properties=None, success=True): + person_filter = filter.with_data( + { + "funnel_correlation_person_entity": {"id": event_name, "type": "events", "properties": properties}, + "funnel_correlation_person_converted": "TrUe" if success else "falSE", + } + ) + results, _ = FunnelCorrelationPersons(person_filter, self.team).run() + return [row["uuid"] for row in results] + + def test_basic_funnel_correlation_with_events(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-14", + "funnel_correlation_type": "events", + } + + filter = Filter(data=filters) + correlation = FunnelCorrelation(filter, self.team) + + for i in range(10): + _create_person(distinct_ids=[f"user_{i}"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_{i}", timestamp="2020-01-02T14:00:00Z", + ) + if i % 2 == 0: + _create_event( + team=self.team, + event="positively_related", + distinct_id=f"user_{i}", + timestamp="2020-01-03T14:00:00Z", + ) + _create_event( + team=self.team, event="paid", distinct_id=f"user_{i}", timestamp="2020-01-04T14:00:00Z", + ) + + for i in range(10, 20): + _create_person(distinct_ids=[f"user_{i}"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_{i}", timestamp="2020-01-02T14:00:00Z", + ) + if i % 2 == 0: + _create_event( + team=self.team, + event="negatively_related", + distinct_id=f"user_{i}", + timestamp="2020-01-03T14:00:00Z", + ) + + result = correlation._run()[0] + + odds_ratios = [item.pop("odds_ratio") for item in result] # type: ignore + expected_odds_ratios = [11, 1 / 11] + + for odds, expected_odds in zip(odds_ratios, expected_odds_ratios): + self.assertAlmostEqual(odds, expected_odds) + + self.assertEqual( + result, + [ + { + "event": "positively_related", + "success_count": 5, + "failure_count": 0, + # "odds_ratio": 11.0, + "correlation_type": "success", + }, + { + "event": "negatively_related", + "success_count": 0, + "failure_count": 5, + # "odds_ratio": 1 / 11, + "correlation_type": "failure", + }, + ], + ) + + self.assertEqual(len(self._get_people_for_event(filter, "positively_related")), 5) + self.assertEqual(len(self._get_people_for_event(filter, "positively_related", success=False)), 0) + self.assertEqual(len(self._get_people_for_event(filter, "negatively_related", success=False)), 5) + self.assertEqual(len(self._get_people_for_event(filter, "negatively_related")), 0) + + # Now exclude positively_related + filter = filter.with_data({"funnel_correlation_exclude_event_names": ["positively_related"]}) + correlation = FunnelCorrelation(filter, self.team) + + result = correlation._run()[0] + + odds_ratio = result[0].pop("odds_ratio") # 
type: ignore + expected_odds_ratio = 1 / 11 + + self.assertAlmostEqual(odds_ratio, expected_odds_ratio) + + self.assertEqual( + result, + [ + { + "event": "negatively_related", + "success_count": 0, + "failure_count": 5, + # "odds_ratio": 1 / 11, + "correlation_type": "failure", + }, + ], + ) + # Getting specific people isn't affected by exclude_events + self.assertEqual(len(self._get_people_for_event(filter, "positively_related")), 5) + self.assertEqual(len(self._get_people_for_event(filter, "positively_related", success=False)), 0) + self.assertEqual(len(self._get_people_for_event(filter, "negatively_related", success=False)), 5) + self.assertEqual(len(self._get_people_for_event(filter, "negatively_related")), 0) + + @test_with_materialized_columns(event_properties=[], person_properties=["$browser"]) + def test_basic_funnel_correlation_with_properties(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-14", + "funnel_correlation_type": "properties", + "funnel_correlation_names": ["$browser"], + } + + filter = Filter(data=filters) + correlation = FunnelCorrelation(filter, self.team) + + for i in range(10): + _create_person(distinct_ids=[f"user_{i}"], team_id=self.team.pk, properties={"$browser": "Positive"}) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_{i}", timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, event="paid", distinct_id=f"user_{i}", timestamp="2020-01-04T14:00:00Z", + ) + + for i in range(10, 20): + _create_person(distinct_ids=[f"user_{i}"], team_id=self.team.pk, properties={"$browser": "Negative"}) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_{i}", timestamp="2020-01-02T14:00:00Z", + ) + if i % 2 == 0: + _create_event( + team=self.team, + event="negatively_related", + distinct_id=f"user_{i}", + timestamp="2020-01-03T14:00:00Z", + ) + + # One Positive with failure + _create_person(distinct_ids=[f"user_fail"], team_id=self.team.pk, properties={"$browser": "Positive"}) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_fail", timestamp="2020-01-02T14:00:00Z", + ) + + # One Negative with success + _create_person(distinct_ids=[f"user_succ"], team_id=self.team.pk, properties={"$browser": "Negative"}) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_succ", timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, event="paid", distinct_id=f"user_succ", timestamp="2020-01-04T14:00:00Z", + ) + + result = correlation._run()[0] + + odds_ratios = [item.pop("odds_ratio") for item in result] # type: ignore + + # Success Total = 11, Failure Total = 11 + # + # Browser::Positive + # Success: 10 + # Failure: 1 + + # Browser::Negative + # Success: 1 + # Failure: 10 + + prior_count = 1 + expected_odds_ratios = [ + ((10 + prior_count) / (1 + prior_count)) * ((11 - 1 + prior_count) / (11 - 10 + prior_count)), + ((1 + prior_count) / (10 + prior_count)) * ((11 - 10 + prior_count) / (11 - 1 + prior_count)), + ] + + for odds, expected_odds in zip(odds_ratios, expected_odds_ratios): + self.assertAlmostEqual(odds, expected_odds) + + self.assertEqual( + result, + [ + { + "event": "$browser::Positive", + "success_count": 10, + "failure_count": 1, + # "odds_ratio": 121/4, + "correlation_type": "success", + }, + { + "event": "$browser::Negative", + "success_count": 1, + 
"failure_count": 10, + # "odds_ratio": 4/121, + "correlation_type": "failure", + }, + ], + ) + + def test_no_divide_by_zero_errors(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-14", + } + + filter = Filter(data=filters) + correlation = FunnelCorrelation(filter, self.team) + + for i in range(2): + _create_person(distinct_ids=[f"user_{i}"], team_id=self.team.pk, properties={"$browser": "Positive"}) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_{i}", timestamp="2020-01-02T14:00:00Z", + ) + # failure count for this event is 0 + _create_event( + team=self.team, event="positive", distinct_id=f"user_{i}", timestamp="2020-01-03T14:00:00Z", + ) + _create_event( + team=self.team, event="paid", distinct_id=f"user_{i}", timestamp="2020-01-04T14:00:00Z", + ) + + for i in range(2, 4): + _create_person(distinct_ids=[f"user_{i}"], team_id=self.team.pk, properties={"$browser": "Negative"}) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_{i}", timestamp="2020-01-02T14:00:00Z", + ) + if i % 2 == 0: + # success count for this event is 0 + _create_event( + team=self.team, + event="negatively_related", + distinct_id=f"user_{i}", + timestamp="2020-01-03T14:00:00Z", + ) + + results = correlation._run() + self.assertFalse(results[1]) + + result = results[0] + + odds_ratios = [item.pop("odds_ratio") for item in result] # type: ignore + expected_odds_ratios = [9, 1 / 3] + + for odds, expected_odds in zip(odds_ratios, expected_odds_ratios): + self.assertAlmostEqual(odds, expected_odds) + + self.assertEqual( + result, + [ + { + "event": "positive", + "success_count": 2, + "failure_count": 0, + # "odds_ratio": 9.0, + "correlation_type": "success", + }, + { + "event": "negatively_related", + "success_count": 0, + "failure_count": 1, + # "odds_ratio": 1 / 3, + "correlation_type": "failure", + }, + ], + ) + + def test_correlation_with_properties_raises_validation_error(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-14", + "funnel_correlation_type": "properties", + # "funnel_correlation_names": ["$browser"], missing value + } + + filter = Filter(data=filters) + correlation = FunnelCorrelation(filter, self.team) + + _create_person(distinct_ids=[f"user_1"], team_id=self.team.pk, properties={"$browser": "Positive"}) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_1", timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, event="rick", distinct_id=f"user_1", timestamp="2020-01-03T14:00:00Z", + ) + _create_event( + team=self.team, event="paid", distinct_id=f"user_1", timestamp="2020-01-04T14:00:00Z", + ) + + with self.assertRaises(ValidationError): + correlation._run() + + filter = filter.with_data({"funnel_correlation_type": "event_with_properties"}) + # missing "funnel_correlation_event_names": ["rick"], + with self.assertRaises(ValidationError): + FunnelCorrelation(filter, self.team)._run() + + @test_with_materialized_columns(event_properties=[], person_properties=["$browser"]) + def test_correlation_with_multiple_properties(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "paid", "type": "events", "order": 1}, 
+ ], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-14", + "funnel_correlation_type": "properties", + "funnel_correlation_names": ["$browser", "$nice"], + } + + filter = Filter(data=filters) + correlation = FunnelCorrelation(filter, self.team) + + #  5 successful people with both properties + for i in range(5): + _create_person( + distinct_ids=[f"user_{i}"], team_id=self.team.pk, properties={"$browser": "Positive", "$nice": "very"} + ) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_{i}", timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, event="paid", distinct_id=f"user_{i}", timestamp="2020-01-04T14:00:00Z", + ) + + #  10 successful people with some different properties + for i in range(5, 15): + _create_person( + distinct_ids=[f"user_{i}"], team_id=self.team.pk, properties={"$browser": "Positive", "$nice": "not"} + ) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_{i}", timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, event="paid", distinct_id=f"user_{i}", timestamp="2020-01-04T14:00:00Z", + ) + + # 5 Unsuccessful people with some common properties + for i in range(15, 20): + _create_person( + distinct_ids=[f"user_{i}"], team_id=self.team.pk, properties={"$browser": "Negative", "$nice": "smh"} + ) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_{i}", timestamp="2020-01-02T14:00:00Z", + ) + + # One Positive with failure, no $nice property + _create_person(distinct_ids=[f"user_fail"], team_id=self.team.pk, properties={"$browser": "Positive"}) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_fail", timestamp="2020-01-02T14:00:00Z", + ) + + # One Negative with success, no $nice property + _create_person(distinct_ids=[f"user_succ"], team_id=self.team.pk, properties={"$browser": "Negative"}) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_succ", timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, event="paid", distinct_id=f"user_succ", timestamp="2020-01-04T14:00:00Z", + ) + + result = correlation._run()[0] + + # Success Total = 5 + 10 + 1 = 16 + # Failure Total = 5 + 1 = 6 + # Add 1 for priors + + odds_ratios = [item.pop("odds_ratio") for item in result] # type: ignore + expected_odds_ratios = [ + (16 / 2) * ((7 - 1) / (17 - 15)), + (11 / 1) * ((7 - 0) / (17 - 10)), + (6 / 1) * ((7 - 0) / (17 - 5)), + (1 / 6) * ((7 - 5) / (17 - 0)), + (2 / 6) * ((7 - 5) / (17 - 1)), + (2 / 2) * ((7 - 1) / (17 - 1)), + ] + # (success + 1) / (failure + 1) + + for odds, expected_odds in zip(odds_ratios, expected_odds_ratios): + self.assertAlmostEqual(odds, expected_odds) + + expected_result = [ + { + "event": "$browser::Positive", + "success_count": 15, + "failure_count": 1, + # "odds_ratio": 24, + "correlation_type": "success", + }, + { + "event": "$nice::not", + "success_count": 10, + "failure_count": 0, + # "odds_ratio": 11, + "correlation_type": "success", + }, + { + "event": "$nice::very", + "success_count": 5, + "failure_count": 0, + # "odds_ratio": 3.5, + "correlation_type": "success", + }, + { + "event": "$nice::smh", + "success_count": 0, + "failure_count": 5, + # "odds_ratio": 0.0196078431372549, + "correlation_type": "failure", + }, + { + "event": "$browser::Negative", + "success_count": 1, + "failure_count": 5, + # "odds_ratio": 0.041666666666666664, + "correlation_type": "failure", + }, + { + "event": "$nice::", + "success_count": 1, + "failure_count": 
1, + # "odds_ratio": 0.375, + "correlation_type": "failure", + }, + ] + + self.assertEqual(result, expected_result) + + # _run property correlation with filter on all properties + filter = filter.with_data({"funnel_correlation_names": ["$all"]}) + correlation = FunnelCorrelation(filter, self.team) + + new_result = correlation._run()[0] + + odds_ratios = [item.pop("odds_ratio") for item in new_result] # type: ignore + + new_expected_odds_ratios = expected_odds_ratios[:-1] + new_expected_result = expected_result[:-1] + # When querying all properties, we don't consider properties that don't exist for part of the data + # since users aren't explicitly asking for that property. Thus, + # We discard $nice:: because it's an empty result set + + for odds, expected_odds in zip(odds_ratios, new_expected_odds_ratios): + self.assertAlmostEqual(odds, expected_odds) + + self.assertEqual(new_result, new_expected_result) + + filter = filter.with_data({"funnel_correlation_exclude_names": ["$browser"]}) + # search for $all but exclude $browser + correlation = FunnelCorrelation(filter, self.team) + + new_result = correlation._run()[0] + odds_ratios = [item.pop("odds_ratio") for item in new_result] # type: ignore + + new_expected_odds_ratios = expected_odds_ratios[1:4] # choosing the $nice property values + new_expected_result = expected_result[1:4] + + for odds, expected_odds in zip(odds_ratios, new_expected_odds_ratios): + self.assertAlmostEqual(odds, expected_odds) + + self.assertEqual(new_result, new_expected_result) + + def test_discarding_insignificant_events(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-14", + "funnel_correlation_type": "events", + } + + filter = Filter(data=filters) + correlation = FunnelCorrelation(filter, self.team) + + for i in range(10): + _create_person(distinct_ids=[f"user_{i}"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_{i}", timestamp="2020-01-02T14:00:00Z", + ) + if i % 2 == 0: + _create_event( + team=self.team, + event="positively_related", + distinct_id=f"user_{i}", + timestamp="2020-01-03T14:00:00Z", + ) + if i % 10 == 0: + _create_event( + team=self.team, + event="low_sig_positively_related", + distinct_id=f"user_{i}", + timestamp="2020-01-03T14:20:00Z", + ) + _create_event( + team=self.team, event="paid", distinct_id=f"user_{i}", timestamp="2020-01-04T14:00:00Z", + ) + + for i in range(10, 20): + _create_person(distinct_ids=[f"user_{i}"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_{i}", timestamp="2020-01-02T14:00:00Z", + ) + if i % 2 == 0: + _create_event( + team=self.team, + event="negatively_related", + distinct_id=f"user_{i}", + timestamp="2020-01-03T14:00:00Z", + ) + if i % 5 == 0: + _create_event( + team=self.team, + event="low_sig_negatively_related", + distinct_id=f"user_{i}", + timestamp="2020-01-03T14:00:00Z", + ) + + #  Total 10 positive, 10 negative + # low sig count = 1 and 2, high sig count >= 5 + # Thus, to discard the low sig count, % needs to be >= 10%, or count >= 2 + + # Discard both due to % + FunnelCorrelation.MIN_PERSON_PERCENTAGE = 0.11 + FunnelCorrelation.MIN_PERSON_COUNT = 25 + result = correlation._run()[0] + self.assertEqual(len(result), 2) + + def test_events_within_conversion_window_for_correlation(self): + filters = { + "events": [ + {"id": "user 
signed up", "type": "events", "order": 0}, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "funnel_window_interval": "10", + "funnel_window_interval_unit": "minute", + "date_from": "2020-01-01", + "date_to": "2020-01-14", + "funnel_correlation_type": "events", + } + + filter = Filter(data=filters) + correlation = FunnelCorrelation(filter, self.team) + + _create_person(distinct_ids=["user_successful"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id="user_successful", timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, event="positively_related", distinct_id="user_successful", timestamp="2020-01-02T14:02:00Z", + ) + _create_event( + team=self.team, event="paid", distinct_id="user_successful", timestamp="2020-01-02T14:06:00Z", + ) + + _create_person(distinct_ids=["user_dropoff"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id="user_dropoff", timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, + event="NOT_negatively_related", + distinct_id="user_dropoff", + timestamp="2020-01-02T14:15:00Z", # event happened outside conversion window + ) + + result = correlation._run()[0] + + odds_ratios = [item.pop("odds_ratio") for item in result] # type: ignore + expected_odds_ratios = [4] + + for odds, expected_odds in zip(odds_ratios, expected_odds_ratios): + self.assertAlmostEqual(odds, expected_odds) + + self.assertEqual( + result, + [ + { + "event": "positively_related", + "success_count": 1, + "failure_count": 0, + # "odds_ratio": 4.0, + "correlation_type": "success", + }, + ], + ) + + @test_with_materialized_columns(["blah", "signup_source"]) + def test_funnel_correlation_with_event_properties(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-14", + "funnel_correlation_type": "event_with_properties", + "funnel_correlation_event_names": ["positively_related", "negatively_related"], + } + + filter = Filter(data=filters) + correlation = FunnelCorrelation(filter, self.team) + + for i in range(10): + _create_person(distinct_ids=[f"user_{i}"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_{i}", timestamp="2020-01-02T14:00:00Z", + ) + if i % 2 == 0: + _create_event( + team=self.team, + event="positively_related", + distinct_id=f"user_{i}", + timestamp="2020-01-03T14:00:00Z", + properties={"signup_source": "facebook" if i % 4 == 0 else "email", "blah": "value_bleh"}, + ) + # source: email occurs only twice, so would be discarded from result set + _create_event( + team=self.team, event="paid", distinct_id=f"user_{i}", timestamp="2020-01-04T14:00:00Z", + ) + + for i in range(10, 20): + _create_person(distinct_ids=[f"user_{i}"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_{i}", timestamp="2020-01-02T14:00:00Z", + ) + if i % 2 == 0: + _create_event( + team=self.team, + event="negatively_related", + distinct_id=f"user_{i}", + timestamp="2020-01-03T14:00:00Z", + properties={"signup_source": "shazam" if i % 6 == 0 else "email"}, + ) + # source: shazam occurs only once, so would be discarded from result set + + result = correlation._run()[0] + + odds_ratios = [item.pop("odds_ratio") for item in result] # type: ignore + expected_odds_ratios = [11, 5.5, 2 / 11] + + for 
odds, expected_odds in zip(odds_ratios, expected_odds_ratios): + self.assertAlmostEqual(odds, expected_odds) + + self.assertEqual( + result, + [ + { + "event": "positively_related::blah::value_bleh", + "success_count": 5, + "failure_count": 0, + # "odds_ratio": 11.0, + "correlation_type": "success", + }, + { + "event": "positively_related::signup_source::facebook", + "success_count": 3, + "failure_count": 0, + # "odds_ratio": 5.5, + "correlation_type": "success", + }, + { + "event": "negatively_related::signup_source::email", + "success_count": 0, + "failure_count": 3, + # "odds_ratio": 0.18181818181818182, + "correlation_type": "failure", + }, + ], + ) + + self.assertEqual(len(self._get_people_for_event(filter, "positively_related", {"blah": "value_bleh"})), 5) + self.assertEqual( + len(self._get_people_for_event(filter, "positively_related", {"signup_source": "facebook"})), 3 + ) + self.assertEqual( + len(self._get_people_for_event(filter, "positively_related", {"signup_source": "facebook"}, False)), 0 + ) + self.assertEqual( + len(self._get_people_for_event(filter, "negatively_related", {"signup_source": "email"}, False)), 3 + ) + + def test_funnel_correlation_with_event_properties_exclusions(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-14", + "funnel_correlation_type": "event_with_properties", + "funnel_correlation_event_names": ["positively_related"], + "funnel_correlation_event_exclude_property_names": ["signup_source"], + } + + filter = Filter(data=filters) + correlation = FunnelCorrelation(filter, self.team) + + # Need more than 2 events to get a correlation + for i in range(3): + _create_person(distinct_ids=[f"user_{i}"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_{i}", timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, + event="positively_related", + distinct_id=f"user_{i}", + timestamp="2020-01-03T14:00:00Z", + properties={"signup_source": "facebook", "blah": "value_bleh"}, + ) + _create_event( + team=self.team, event="paid", distinct_id=f"user_{i}", timestamp="2020-01-04T14:00:00Z", + ) + + # Atleast one person that fails, to ensure we get results + _create_person(distinct_ids=[f"user_fail"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_fail", timestamp="2020-01-02T14:00:00Z", + ) + + result = correlation._run()[0] + self.assertEqual( + result, + [ + { + "event": "positively_related::blah::value_bleh", + "success_count": 3, + "failure_count": 0, + "odds_ratio": 8, + "correlation_type": "success", + }, + #  missing signup_source, as expected + ], + ) + + self.assertEqual(len(self._get_people_for_event(filter, "positively_related", {"blah": "value_bleh"})), 3) + + # If you search for persons with a specific property, even if excluded earlier, you should get them + self.assertEqual( + len(self._get_people_for_event(filter, "positively_related", {"signup_source": "facebook"})), 3 + ) + + @test_with_materialized_columns(["$event_type", "signup_source"]) + def test_funnel_correlation_with_event_properties_autocapture(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-14", + "funnel_correlation_type": 
"event_with_properties", + "funnel_correlation_event_names": ["$autocapture"], + } + + filter = Filter(data=filters) + correlation = FunnelCorrelation(filter, self.team) + + # Need a minimum of 3 hits to get a correlation result + for i in range(6): + _create_person(distinct_ids=[f"user_{i}"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_{i}", timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, + event="$autocapture", + distinct_id=f"user_{i}", + elements=[Element(nth_of_type=1, nth_child=0, tag_name="a", href="/movie")], + timestamp="2020-01-03T14:00:00Z", + properties={"signup_source": "email", "$event_type": "click"}, + ) + # Test two different types of autocapture elements, with different counts, so we can accurately test results + if i % 2 == 0: + _create_event( + team=self.team, + event="$autocapture", + distinct_id=f"user_{i}", + elements=[Element(nth_of_type=1, nth_child=0, tag_name="button", text="Pay $10")], + timestamp="2020-01-03T14:00:00Z", + properties={"signup_source": "facebook", "$event_type": "submit"}, + ) + + _create_event( + team=self.team, event="paid", distinct_id=f"user_{i}", timestamp="2020-01-04T14:00:00Z", + ) + + # Atleast one person that fails, to ensure we get results + _create_person(distinct_ids=[f"user_fail"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_fail", timestamp="2020-01-02T14:00:00Z", + ) + + result = correlation._run()[0] + + # $autocapture results only return elements chain + self.assertEqual( + result, + [ + { + "event": '$autocapture::elements_chain::click__~~__a:href="/movie"nth-child="0"nth-of-type="1"', + "success_count": 6, + "failure_count": 0, + "odds_ratio": 14.0, + "correlation_type": "success", + }, + { + "event": '$autocapture::elements_chain::submit__~~__button:nth-child="0"nth-of-type="1"text="Pay $10"', + "success_count": 3, + "failure_count": 0, + "odds_ratio": 2.0, + "correlation_type": "success", + }, + ], + ) + + self.assertEqual(len(self._get_people_for_event(filter, "$autocapture", {"signup_source": "facebook"})), 3) + self.assertEqual(len(self._get_people_for_event(filter, "$autocapture", {"$event_type": "click"})), 6) + self.assertEqual( + len( + self._get_people_for_event( + filter, + "$autocapture", + [ + {"key": "tag_name", "operator": "exact", "type": "element", "value": "button"}, + {"key": "text", "operator": "exact", "type": "element", "value": "Pay $10"}, + ], + ) + ), + 3, + ) + self.assertEqual( + len( + self._get_people_for_event( + filter, + "$autocapture", + [ + {"key": "tag_name", "operator": "exact", "type": "element", "value": "a"}, + {"key": "href", "operator": "exact", "type": "element", "value": "/movie"}, + ], + ) + ), + 6, + ) + + +class TestCorrelationFunctions(unittest.TestCase): + def test_are_results_insignificant(self): + # Same setup as above test: test_discarding_insignificant_events + contingency_tables = [ + EventContingencyTable( + event="negatively_related", + visited=EventStats(success_count=0, failure_count=5), + success_total=10, + failure_total=10, + ), + EventContingencyTable( + event="positively_related", + visited=EventStats(success_count=5, failure_count=0), + success_total=10, + failure_total=10, + ), + EventContingencyTable( + event="low_sig_negatively_related", + visited=EventStats(success_count=0, failure_count=2), + success_total=10, + failure_total=10, + ), + EventContingencyTable( + event="low_sig_positively_related", + 
visited=EventStats(success_count=1, failure_count=0), + success_total=10, + failure_total=10, + ), + ] + + # Discard both low_sig due to % + FunnelCorrelation.MIN_PERSON_PERCENTAGE = 0.11 + FunnelCorrelation.MIN_PERSON_COUNT = 25 + result = [ + 1 + for contingency_table in contingency_tables + if not FunnelCorrelation.are_results_insignificant(contingency_table) + ] + self.assertEqual(len(result), 2) + + # Discard one low_sig due to % + FunnelCorrelation.MIN_PERSON_PERCENTAGE = 0.051 + FunnelCorrelation.MIN_PERSON_COUNT = 25 + result = [ + 1 + for contingency_table in contingency_tables + if not FunnelCorrelation.are_results_insignificant(contingency_table) + ] + self.assertEqual(len(result), 3) + + # Discard both due to count + FunnelCorrelation.MIN_PERSON_PERCENTAGE = 0.5 + FunnelCorrelation.MIN_PERSON_COUNT = 3 + result = [ + 1 + for contingency_table in contingency_tables + if not FunnelCorrelation.are_results_insignificant(contingency_table) + ] + self.assertEqual(len(result), 2) + + # Discard one due to count + FunnelCorrelation.MIN_PERSON_PERCENTAGE = 0.5 + FunnelCorrelation.MIN_PERSON_COUNT = 2 + result = [ + 1 + for contingency_table in contingency_tables + if not FunnelCorrelation.are_results_insignificant(contingency_table) + ] + self.assertEqual(len(result), 3) + + # Discard everything due to % + FunnelCorrelation.MIN_PERSON_PERCENTAGE = 0.5 + FunnelCorrelation.MIN_PERSON_COUNT = 100 + result = [ + 1 + for contingency_table in contingency_tables + if not FunnelCorrelation.are_results_insignificant(contingency_table) + ] + self.assertEqual(len(result), 0) + + # Discard everything due to count + FunnelCorrelation.MIN_PERSON_PERCENTAGE = 0.5 + FunnelCorrelation.MIN_PERSON_COUNT = 6 + result = [ + 1 + for contingency_table in contingency_tables + if not FunnelCorrelation.are_results_insignificant(contingency_table) + ] + self.assertEqual(len(result), 0) diff --git a/ee/clickhouse/queries/funnels/test/test_funnel_correlation_persons.py b/ee/clickhouse/queries/funnels/test/test_funnel_correlation_persons.py new file mode 100644 index 0000000000000..c991c8537cfd9 --- /dev/null +++ b/ee/clickhouse/queries/funnels/test/test_funnel_correlation_persons.py @@ -0,0 +1,210 @@ +from uuid import uuid4 + +from ee.clickhouse.models.event import create_event +from ee.clickhouse.queries.funnels.funnel_correlation_persons import FunnelCorrelationPersons +from ee.clickhouse.util import ClickhouseTestMixin +from posthog.constants import INSIGHT_FUNNELS +from posthog.models import Cohort, Filter +from posthog.models.person import Person +from posthog.test.base import APIBaseTest, test_with_materialized_columns + +FORMAT_TIME = "%Y-%m-%d 00:00:00" +MAX_STEP_COLUMN = 0 +COUNT_COLUMN = 1 +PERSON_ID_COLUMN = 2 + + +def _create_person(**kwargs): + person = Person.objects.create(**kwargs) + return Person(id=person.uuid, uuid=person.uuid) + + +def _create_event(**kwargs): + kwargs.update({"event_uuid": uuid4()}) + create_event(**kwargs) + + +class TestClickhouseFunnelCorrelationPersons(ClickhouseTestMixin, APIBaseTest): + + maxDiff = None + + def test_basic_funnel_correlation_with_events(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-14", + "funnel_correlation_type": "events", + } + + filter = Filter(data=filters) + + success_target_persons = [] + failure_target_persons = [] + + for i in range(10): + person = 
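# Aside: a sketch of the keep/discard rule that the MIN_PERSON_COUNT /
# MIN_PERSON_PERCENTAGE cases in TestCorrelationFunctions above exercise -- an
# event is discarded only when the number of people who saw it falls below BOTH
# the absolute count threshold and the percentage-of-all-funnel-people threshold.
# This reproduces the expected kept-counts in those cases; it is an assumption
# about the rule, not the literal are_results_insignificant implementation.
def is_insignificant(success_count, failure_count, success_total, failure_total,
                     min_count, min_percentage):
    people_who_saw_event = success_count + failure_count
    all_people = success_total + failure_total
    return (people_who_saw_event < min_count
            and people_who_saw_event < min_percentage * all_people)

# MIN_PERSON_PERCENTAGE=0.11, MIN_PERSON_COUNT=25, with 10 successes + 10 failures:
assert not is_insignificant(5, 0, 10, 10, 25, 0.11)  # 5 people >= 11% of 20 -> kept
assert is_insignificant(0, 2, 10, 10, 25, 0.11)      # 2 people, below both -> dropped
assert is_insignificant(1, 0, 10, 10, 25, 0.11)      # 1 person, below both -> dropped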
_create_person(distinct_ids=[f"user_{i}"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_{i}", timestamp="2020-01-02T14:00:00Z", + ) + if i % 2 == 0: + _create_event( + team=self.team, + event="positively_related", + distinct_id=f"user_{i}", + timestamp="2020-01-03T14:00:00Z", + ) + success_target_persons.append(str(person.uuid)) + + _create_event( + team=self.team, event="paid", distinct_id=f"user_{i}", timestamp="2020-01-04T14:00:00Z", + ) + + for i in range(10, 20): + person = _create_person(distinct_ids=[f"user_{i}"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_{i}", timestamp="2020-01-02T14:00:00Z", + ) + if i % 2 == 0: + _create_event( + team=self.team, + event="negatively_related", + distinct_id=f"user_{i}", + timestamp="2020-01-03T14:00:00Z", + ) + failure_target_persons.append(str(person.uuid)) + + # One positively_related as failure + person_fail = _create_person(distinct_ids=[f"user_fail"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_fail", timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, event="positively_related", distinct_id=f"user_fail", timestamp="2020-01-03T14:00:00Z", + ) + + # One negatively_related as success + person_succ = _create_person(distinct_ids=[f"user_succ"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_succ", timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, event="negatively_related", distinct_id=f"user_succ", timestamp="2020-01-03T14:00:00Z", + ) + _create_event( + team=self.team, event="paid", distinct_id=f"user_succ", timestamp="2020-01-04T14:00:00Z", + ) + + # TESTS + + # test positively_related successes + filter = filter.with_data( + { + "funnel_correlation_person_entity": {"id": "positively_related", "type": "events"}, + "funnel_correlation_person_converted": "TrUe", + } + ) + results, has_more_results = FunnelCorrelationPersons(filter, self.team).run() + + self.assertFalse(has_more_results) + self.assertCountEqual([val["uuid"] for val in results], success_target_persons) + + # test negatively_related failures + filter = filter.with_data( + { + "funnel_correlation_person_entity": {"id": "negatively_related", "type": "events"}, + "funnel_correlation_person_converted": "falsE", + } + ) + + results, has_more_results = FunnelCorrelationPersons(filter, self.team).run() + + self.assertFalse(has_more_results) + self.assertCountEqual([val["uuid"] for val in results], failure_target_persons) + + # test positively_related failures + filter = filter.with_data( + { + "funnel_correlation_person_entity": {"id": "positively_related", "type": "events"}, + "funnel_correlation_person_converted": "False", + } + ) + results, has_more_results = FunnelCorrelationPersons(filter, self.team).run() + + self.assertFalse(has_more_results) + self.assertCountEqual([val["uuid"] for val in results], [str(person_fail.uuid)]) + + # test negatively_related successes + filter = filter.with_data( + { + "funnel_correlation_person_entity": {"id": "negatively_related", "type": "events"}, + "funnel_correlation_person_converted": "trUE", + } + ) + results, has_more_results = FunnelCorrelationPersons(filter, self.team).run() + + self.assertFalse(has_more_results) + self.assertCountEqual([val["uuid"] for val in results], [str(person_succ.uuid)]) + + # test all positively_related + filter = filter.with_data( + { + 
"funnel_correlation_person_entity": {"id": "positively_related", "type": "events"}, + "funnel_correlation_person_converted": None, + } + ) + results, has_more_results = FunnelCorrelationPersons(filter, self.team).run() + + self.assertFalse(has_more_results) + self.assertCountEqual([val["uuid"] for val in results], [*success_target_persons, str(person_fail.uuid)]) + + # test all negatively_related + filter = filter.with_data( + { + "funnel_correlation_person_entity": {"id": "negatively_related", "type": "events"}, + "funnel_correlation_person_converted": None, + } + ) + results, has_more_results = FunnelCorrelationPersons(filter, self.team).run() + + self.assertFalse(has_more_results) + self.assertCountEqual([val["uuid"] for val in results], [*failure_target_persons, str(person_succ.uuid)]) + + def test_people_arent_returned_multiple_times(self): + + person = _create_person(distinct_ids=[f"user_1"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_1", timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, event="positively_related", distinct_id=f"user_1", timestamp="2020-01-03T14:00:00Z", + ) + # duplicate event + _create_event( + team=self.team, event="positively_related", distinct_id=f"user_1", timestamp="2020-01-03T15:00:00Z", + ) + _create_event( + team=self.team, event="paid", distinct_id=f"user_1", timestamp="2020-01-04T14:00:00Z", + ) + + filter = Filter( + data={ + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-14", + "funnel_correlation_type": "events", + "funnel_correlation_person_entity": {"id": "positively_related", "type": "events"}, + "funnel_correlation_person_converted": "TrUe", + } + ) + results, has_more_results = FunnelCorrelationPersons(filter, self.team).run() + + self.assertFalse(has_more_results) + self.assertCountEqual([val["uuid"] for val in results], [str(person.uuid)]) diff --git a/ee/clickhouse/queries/funnels/test/test_funnel_persons.py b/ee/clickhouse/queries/funnels/test/test_funnel_persons.py new file mode 100644 index 0000000000000..b2f5c95b7302e --- /dev/null +++ b/ee/clickhouse/queries/funnels/test/test_funnel_persons.py @@ -0,0 +1,376 @@ +from uuid import uuid4 + +from ee.clickhouse.models.event import create_event +from ee.clickhouse.queries.funnels.funnel import ClickhouseFunnel +from ee.clickhouse.queries.funnels.funnel_persons import ClickhouseFunnelPersons +from ee.clickhouse.util import ClickhouseTestMixin +from posthog.constants import INSIGHT_FUNNELS +from posthog.models import Cohort, Filter +from posthog.models.person import Person +from posthog.test.base import APIBaseTest, test_with_materialized_columns + +FORMAT_TIME = "%Y-%m-%d 00:00:00" +MAX_STEP_COLUMN = 0 +COUNT_COLUMN = 1 +PERSON_ID_COLUMN = 2 + + +def _create_person(**kwargs): + person = Person.objects.create(**kwargs) + return Person(id=person.uuid, uuid=person.uuid) + + +def _create_event(**kwargs): + kwargs.update({"event_uuid": uuid4()}) + create_event(**kwargs) + + +class TestFunnelPersons(ClickhouseTestMixin, APIBaseTest): + def _create_sample_data_multiple_dropoffs(self): + for i in range(5): + _create_person(distinct_ids=[f"user_{i}"], team=self.team) + _create_event(event="step one", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-01 00:00:00") + _create_event(event="step two", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-03 
00:00:00") + _create_event(event="step three", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-05 00:00:00") + + for i in range(5, 15): + _create_person(distinct_ids=[f"user_{i}"], team=self.team) + _create_event(event="step one", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-01 00:00:00") + _create_event(event="step two", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-03 00:00:00") + + for i in range(15, 35): + _create_person(distinct_ids=[f"user_{i}"], team=self.team) + _create_event(event="step one", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-01 00:00:00") + + def _create_browser_breakdown_events(self): + person1 = _create_person(distinct_ids=["person1"], team_id=self.team.pk, properties={"$country": "PL"}) + _create_event( + team=self.team, + event="sign up", + distinct_id="person1", + properties={"key": "val", "$browser": "Chrome"}, + timestamp="2020-01-01T12:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person1", + properties={"key": "val", "$browser": "Chrome"}, + timestamp="2020-01-01T13:00:00Z", + ) + _create_event( + team=self.team, + event="buy", + distinct_id="person1", + properties={"key": "val", "$browser": "Chrome"}, + timestamp="2020-01-01T15:00:00Z", + ) + + person2 = _create_person(distinct_ids=["person2"], team_id=self.team.pk, properties={"$country": "EE"}) + _create_event( + team=self.team, + event="sign up", + distinct_id="person2", + properties={"key": "val", "$browser": "Safari"}, + timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person2", + properties={"key": "val", "$browser": "Safari"}, + timestamp="2020-01-02T16:00:00Z", + ) + return person1, person2 + + def test_first_step(self): + self._create_sample_data_multiple_dropoffs() + data = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_days": 7, + "funnel_step": 1, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + filter = Filter(data=data) + results = ClickhouseFunnelPersons(filter, self.team)._exec_query() + self.assertEqual(35, len(results)) + + def test_last_step(self): + self._create_sample_data_multiple_dropoffs() + data = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_days": 7, + "funnel_step": 3, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + filter = Filter(data=data) + results = ClickhouseFunnelPersons(filter, self.team)._exec_query() + self.assertEqual(5, len(results)) + + def test_second_step_dropoff(self): + self._create_sample_data_multiple_dropoffs() + data = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_days": 7, + "funnel_step": -2, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + filter = Filter(data=data) + results = ClickhouseFunnelPersons(filter, self.team)._exec_query() + self.assertEqual(20, len(results)) + + def test_last_step_dropoff(self): + self._create_sample_data_multiple_dropoffs() + data = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 
00:00:00", + "funnel_window_days": 7, + "funnel_step": -3, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + filter = Filter(data=data) + results = ClickhouseFunnelPersons(filter, self.team)._exec_query() + self.assertEqual(10, len(results)) + + def _create_sample_data(self): + for i in range(110): + _create_person(distinct_ids=[f"user_{i}"], team=self.team) + _create_event(event="step one", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-01 00:00:00") + _create_event(event="step two", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-03 00:00:00") + _create_event(event="step three", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-05 00:00:00") + + def test_basic_offset(self): + self._create_sample_data() + data = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_days": 7, + "funnel_step": 1, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + + filter = Filter(data=data) + results = ClickhouseFunnelPersons(filter, self.team)._exec_query() + self.assertEqual(100, len(results)) + + filter_offset = Filter(data={**data, "offset": 100,}) + results, _ = ClickhouseFunnelPersons(filter_offset, self.team).run() + self.assertEqual(10, len(results)) + + def test_steps_with_custom_steps_parameter_are_equivalent_to_funnel_step(self): + self._create_sample_data_multiple_dropoffs() + data = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_days": 7, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + base_filter = Filter(data=data) + + parameters = [ + #  funnel_step, custom_steps, expected_results + (1, [1, 2, 3], 35), + (2, [2, 3], 15), + (3, [3], 5), + (-2, [1], 20), + (-3, [2], 10), + ] + + for funnel_step, custom_steps, expected_count in parameters: + filter = base_filter.with_data({"funnel_step": funnel_step}) + results = ClickhouseFunnelPersons(filter, self.team)._exec_query() + + new_filter = base_filter.with_data({"funnel_custom_steps": custom_steps}) + new_results = ClickhouseFunnelPersons(new_filter, self.team)._exec_query() + + self.assertEqual(new_results, results) + self.assertEqual(len(results), expected_count) + + def test_steps_with_custom_steps_parameter_where_funnel_step_equivalence_isnt_possible(self): + self._create_sample_data_multiple_dropoffs() + data = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_days": 7, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + base_filter = Filter(data=data) + + parameters = [ + # custom_steps, expected_results + ([1, 2], 30), + ([1, 3], 25), + ([3, 1], 25), + ([1, 3, 3, 1], 25), + ] + + for custom_steps, expected_count in parameters: + new_filter = base_filter.with_data({"funnel_custom_steps": custom_steps}) + new_results = ClickhouseFunnelPersons(new_filter, self.team)._exec_query() + + self.assertEqual(len(new_results), expected_count) + + def test_steps_with_custom_steps_parameter_overrides_funnel_step(self): + self._create_sample_data_multiple_dropoffs() + data = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + 
"date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_days": 7, + "funnel_step": 1, # means custom steps = [1,2,3] + "funnel_custom_steps": [3], + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + + results = ClickhouseFunnelPersons(Filter(data=data), self.team)._exec_query() + + self.assertEqual(len(results), 5) + + @test_with_materialized_columns(["$browser"]) + def test_first_step_breakdowns(self): + person1, person2 = self._create_browser_breakdown_events() + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-08", + "interval": "day", + "funnel_window_days": 7, + "funnel_step": 1, + "events": [{"id": "sign up", "order": 0}, {"id": "play movie", "order": 1}, {"id": "buy", "order": 2},], + "breakdown_type": "event", + "breakdown": "$browser", + } + ) + results = ClickhouseFunnelPersons(filter, self.team)._exec_query() + + self.assertCountEqual([val[0] for val in results], [person1.uuid, person2.uuid]) + + results = ClickhouseFunnelPersons( + filter.with_data({"funnel_step_breakdown": "Chrome"}), self.team + )._exec_query() + + self.assertCountEqual([val[0] for val in results], [person1.uuid]) + + results = ClickhouseFunnelPersons( + filter.with_data({"funnel_step_breakdown": "Safari"}), self.team + )._exec_query() + self.assertCountEqual([val[0] for val in results], [person2.uuid]) + + results = ClickhouseFunnelPersons( + filter.with_data({"funnel_step_breakdown": "Safari, Chrome"}), self.team + )._exec_query() + self.assertCountEqual([val[0] for val in results], [person2.uuid, person1.uuid]) + + @test_with_materialized_columns(person_properties=["$country"]) + def test_first_step_breakdown_person(self): + person1, person2 = self._create_browser_breakdown_events() + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-08", + "interval": "day", + "funnel_window_days": 7, + "funnel_step": 1, + "events": [{"id": "sign up", "order": 0}, {"id": "play movie", "order": 1}, {"id": "buy", "order": 2},], + "breakdown_type": "person", + "breakdown": "$country", + } + ) + + results = ClickhouseFunnelPersons(filter, self.team)._exec_query() + self.assertCountEqual([val[0] for val in results], [person1.uuid, person2.uuid]) + + results = ClickhouseFunnelPersons(filter.with_data({"funnel_step_breakdown": "EE"}), self.team)._exec_query() + self.assertCountEqual([val[0] for val in results], [person2.uuid]) + + # Check custom_steps give same answers for breakdowns + custom_step_results = ClickhouseFunnelPersons( + filter.with_data({"funnel_step_breakdown": "EE", "funnel_custom_steps": [1, 2, 3]}), self.team + )._exec_query() + self.assertEqual(results, custom_step_results) + + results = ClickhouseFunnelPersons(filter.with_data({"funnel_step_breakdown": "PL"}), self.team)._exec_query() + self.assertCountEqual([val[0] for val in results], [person1.uuid]) + + # Check custom_steps give same answers for breakdowns + custom_step_results = ClickhouseFunnelPersons( + filter.with_data({"funnel_step_breakdown": "PL", "funnel_custom_steps": [1, 2, 3]}), self.team + )._exec_query() + self.assertEqual(results, custom_step_results) + + @test_with_materialized_columns(["$browser"], verify_no_jsonextract=False) + def test_funnel_cohort_breakdown_persons(self): + person = _create_person(distinct_ids=[f"person1"], team_id=self.team.pk, properties={"key": "value"}) + _create_event( + team=self.team, 
event="sign up", distinct_id=f"person1", properties={}, timestamp="2020-01-02T12:00:00Z", + ) + cohort = Cohort.objects.create( + team=self.team, + name="test_cohort", + groups=[{"properties": [{"key": "key", "value": "value", "type": "person"}]}], + ) + filters = { + "events": [{"id": "sign up", "order": 0}, {"id": "play movie", "order": 1}, {"id": "buy", "order": 2},], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-08", + "funnel_window_days": 7, + "funnel_step": 1, + "breakdown_type": "cohort", + "breakdown": [cohort.pk], + } + filter = Filter(data=filters) + results = ClickhouseFunnelPersons(filter, self.team)._exec_query() + self.assertEqual(results[0][0], person.uuid) diff --git a/ee/clickhouse/queries/funnels/test/test_funnel_strict.py b/ee/clickhouse/queries/funnels/test/test_funnel_strict.py new file mode 100644 index 0000000000000..a0994e371adf7 --- /dev/null +++ b/ee/clickhouse/queries/funnels/test/test_funnel_strict.py @@ -0,0 +1,491 @@ +from datetime import datetime, timedelta +from uuid import uuid4 + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.models.event import create_event +from ee.clickhouse.queries.funnels.funnel_strict import ClickhouseFunnelStrict +from ee.clickhouse.queries.funnels.funnel_strict_persons import ClickhouseFunnelStrictPersons +from ee.clickhouse.queries.funnels.test.breakdown_cases import funnel_breakdown_test_factory +from ee.clickhouse.queries.funnels.test.conversion_time_cases import funnel_conversion_time_test_factory +from ee.clickhouse.util import ClickhouseTestMixin +from posthog.constants import INSIGHT_FUNNELS +from posthog.models.action import Action +from posthog.models.action_step import ActionStep +from posthog.models.filters import Filter +from posthog.models.person import Person +from posthog.test.base import APIBaseTest + +FORMAT_TIME = "%Y-%m-%d 00:00:00" + + +def _create_action(**kwargs): + team = kwargs.pop("team") + name = kwargs.pop("name") + properties = kwargs.pop("properties", {}) + action = Action.objects.create(team=team, name=name) + ActionStep.objects.create(action=action, event=name, properties=properties) + return action + + +def _create_person(**kwargs): + person = Person.objects.create(**kwargs) + return Person(id=person.uuid, uuid=person.uuid) + + +def _create_event(**kwargs): + kwargs.update({"event_uuid": uuid4()}) + create_event(**kwargs) + + +class TestFunnelStrictStepsBreakdown(ClickhouseTestMixin, funnel_breakdown_test_factory(ClickhouseFunnelStrict, ClickhouseFunnelStrictPersons, _create_event, _create_action, _create_person)): # type: ignore + + maxDiff = None + + def test_strict_breakdown_events_with_multiple_properties(self): + + filters = { + "events": [{"id": "sign up", "order": 0}, {"id": "play movie", "order": 1}], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-08", + "funnel_window_days": 7, + "breakdown_type": "event", + "breakdown": "$browser", + } + + filter = Filter(data=filters) + funnel = ClickhouseFunnelStrict(filter, self.team) + + # event + person1 = _create_person(distinct_ids=["person1"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person1", + properties={"key": "val", "$browser": "Chrome"}, + timestamp="2020-01-01T12:00:00Z", + ) + _create_event( + team=self.team, + event="blah", + distinct_id="person1", + properties={"key": "val", "$browser": "Chrome"}, + timestamp="2020-01-01T13:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + 
distinct_id="person1", + properties={"key": "val", "$browser": "Chrome"}, + timestamp="2020-01-01T14:00:00Z", + ) + + person2 = _create_person(distinct_ids=["person2"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person2", + properties={"key": "val", "$browser": "Safari"}, + timestamp="2020-01-02T13:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person2", + properties={"key": "val", "$browser": "Safari"}, + timestamp="2020-01-02T14:00:00Z", + ) + + result = funnel.run() + self.assertEqual( + result[0], + [ + { + "action_id": "sign up", + "name": "sign up", + "custom_name": None, + "order": 0, + "people": [], + "count": 1, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Chrome", + "breakdown_value": "Chrome", + }, + { + "action_id": "play movie", + "name": "play movie", + "custom_name": None, + "order": 1, + "people": [], + "count": 0, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Chrome", + "breakdown_value": "Chrome", + }, + ], + ) + self.assertCountEqual(self._get_people_at_step(filter, 1, "Chrome"), [person1.uuid]) + self.assertCountEqual(self._get_people_at_step(filter, 2, "Chrome"), []) + + self.assertEqual( + result[1], + [ + { + "action_id": "sign up", + "name": "sign up", + "custom_name": None, + "order": 0, + "people": [], + "count": 1, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Safari", + "breakdown_value": "Safari", + }, + { + "action_id": "play movie", + "name": "play movie", + "custom_name": None, + "order": 1, + "people": [], + "count": 1, + "type": "events", + "average_conversion_time": 3600, + "median_conversion_time": 3600, + "breakdown": "Safari", + "breakdown_value": "Safari", + }, + ], + ) + self.assertCountEqual(self._get_people_at_step(filter, 1, "Safari"), [person2.uuid]) + self.assertCountEqual(self._get_people_at_step(filter, 2, "Safari"), [person2.uuid]) + + +class TestFunnelStrictStepsConversionTime(ClickhouseTestMixin, funnel_conversion_time_test_factory(ClickhouseFunnelStrict, ClickhouseFunnelStrictPersons, _create_event, _create_person)): # type: ignore + + maxDiff = None + pass + + +class TestFunnelStrictSteps(ClickhouseTestMixin, APIBaseTest): + + maxDiff = None + + def _get_people_at_step(self, filter, funnel_step, breakdown_value=None): + person_filter = filter.with_data({"funnel_step": funnel_step, "funnel_step_breakdown": breakdown_value}) + result = ClickhouseFunnelStrictPersons(person_filter, self.team)._exec_query() + return [row[0] for row in result] + + def test_basic_strict_funnel(self): + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "events": [ + {"id": "user signed up", "order": 0}, + {"id": "$pageview", "order": 1}, + {"id": "insight viewed", "order": 2}, + ], + } + ) + + funnel = ClickhouseFunnelStrict(filter, self.team) + + person1_stopped_after_signup = _create_person(distinct_ids=["stopped_after_signup1"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_signup1") + + person2_stopped_after_one_pageview = _create_person( + distinct_ids=["stopped_after_pageview1"], team_id=self.team.pk + ) + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview1") + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_pageview1") + + person3_stopped_after_insight_view = 
_create_person( + distinct_ids=["stopped_after_insightview"], team_id=self.team.pk + ) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_insightview") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_insightview") + _create_event(team=self.team, event="blaah blaa", distinct_id="stopped_after_insightview") + _create_event(team=self.team, event="insight viewed", distinct_id="stopped_after_insightview") + + person4_stopped_after_insight_view_not_strict_order = _create_person( + distinct_ids=["stopped_after_insightview2"], team_id=self.team.pk + ) + _create_event(team=self.team, event="insight viewed", distinct_id="stopped_after_insightview2") + _create_event(team=self.team, event="blaah blaa", distinct_id="stopped_after_insightview2") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_insightview2") + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_insightview2") + + person5_stopped_after_insight_view_random = _create_person( + distinct_ids=["stopped_after_insightview3"], team_id=self.team.pk + ) + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_insightview3") + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_insightview3") + _create_event(team=self.team, event="blaah blaa", distinct_id="stopped_after_insightview3") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_insightview3") + _create_event(team=self.team, event="insight viewed", distinct_id="stopped_after_insightview3") + + person6 = _create_person(distinct_ids=["person6"], team_id=self.team.pk) + _create_event(team=self.team, event="blaah blaa", distinct_id="person6") + _create_event(team=self.team, event="user signed up", distinct_id="person6") + _create_event(team=self.team, event="blaah blaa", distinct_id="person6") + _create_event(team=self.team, event="$pageview", distinct_id="person6") + + person7 = _create_person(distinct_ids=["person7"], team_id=self.team.pk) + _create_event(team=self.team, event="blaah blaa", distinct_id="person7") + _create_event(team=self.team, event="user signed up", distinct_id="person7") + _create_event(team=self.team, event="$pageview", distinct_id="person7") + _create_event(team=self.team, event="insight viewed", distinct_id="person7") + _create_event(team=self.team, event="blaah blaa", distinct_id="person7") + + person8_didnot_signup = _create_person(distinct_ids=["stopped_after_insightview6"], team_id=self.team.pk) + _create_event(team=self.team, event="insight viewed", distinct_id="stopped_after_insightview6") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_insightview6") + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[1]["name"], "$pageview") + self.assertEqual(result[2]["name"], "insight viewed") + self.assertEqual(result[0]["count"], 7) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), + [ + person1_stopped_after_signup.uuid, + person2_stopped_after_one_pageview.uuid, + person3_stopped_after_insight_view.uuid, + person4_stopped_after_insight_view_not_strict_order.uuid, + person5_stopped_after_insight_view_random.uuid, + person6.uuid, + person7.uuid, + ], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 2), [person3_stopped_after_insight_view.uuid, person7.uuid,], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 3), [person7.uuid], + ) + + def 
test_advanced_strict_funnel(self): + + sign_up_action = _create_action( + name="sign up", + team=self.team, + properties=[{"key": "key", "type": "event", "value": ["val"], "operator": "exact"}], + ) + + view_action = _create_action( + name="pageview", + team=self.team, + properties=[{"key": "key", "type": "event", "value": ["val"], "operator": "exact"}], + ) + + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "$pageview", "type": "events", "order": 2}, + ], + "actions": [ + {"id": sign_up_action.id, "math": "dau", "order": 1}, + {"id": view_action.id, "math": "wau", "order": 3}, + ], + "insight": INSIGHT_FUNNELS, + } + + filter = Filter(data=filters) + funnel = ClickhouseFunnelStrict(filter, self.team) + + person1_stopped_after_signup = _create_person(distinct_ids=["stopped_after_signup1"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_signup1") + + person2_stopped_after_one_pageview = _create_person( + distinct_ids=["stopped_after_pageview1"], team_id=self.team.pk + ) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_pageview1") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview1") + + person3_stopped_after_insight_view = _create_person( + distinct_ids=["stopped_after_insightview"], team_id=self.team.pk + ) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_insightview") + _create_event( + team=self.team, event="sign up", distinct_id="stopped_after_insightview", properties={"key": "val"} + ) + _create_event( + team=self.team, event="sign up", distinct_id="stopped_after_insightview", properties={"key": "val2"} + ) + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_insightview") + _create_event(team=self.team, event="blaah blaa", distinct_id="stopped_after_insightview") + _create_event(team=self.team, event="insight viewed", distinct_id="stopped_after_insightview") + + person4 = _create_person(distinct_ids=["person4"], team_id=self.team.pk) + _create_event(team=self.team, event="blaah blaa", distinct_id="person4") + _create_event(team=self.team, event="user signed up", distinct_id="person4") + _create_event(team=self.team, event="sign up", distinct_id="person4", properties={"key": "val"}) + _create_event(team=self.team, event="$pageview", distinct_id="person4", properties={"key": "val"}) + _create_event(team=self.team, event="blaah blaa", distinct_id="person4") + + person5 = _create_person(distinct_ids=["person5"], team_id=self.team.pk) + _create_event(team=self.team, event="blaah blaa", distinct_id="person5") + _create_event(team=self.team, event="user signed up", distinct_id="person5") + _create_event(team=self.team, event="sign up", distinct_id="person5", properties={"key": "val"}) + _create_event(team=self.team, event="$pageview", distinct_id="person5") + _create_event(team=self.team, event="blaah blaa", distinct_id="person5") + + person6 = _create_person(distinct_ids=["person6"], team_id=self.team.pk) + _create_event(team=self.team, event="blaah blaa", distinct_id="person6") + _create_event(team=self.team, event="user signed up", distinct_id="person6") + _create_event(team=self.team, event="sign up", distinct_id="person6", properties={"key": "val"}) + _create_event(team=self.team, event="$pageview", distinct_id="person6") + _create_event(team=self.team, event="pageview", distinct_id="person6", properties={"key": "val1"}) + + person7 = 
_create_person(distinct_ids=["person7"], team_id=self.team.pk) + _create_event(team=self.team, event="blaah blaa", distinct_id="person7") + _create_event(team=self.team, event="user signed up", distinct_id="person7") + _create_event(team=self.team, event="sign up", distinct_id="person7", properties={"key": "val"}) + _create_event(team=self.team, event="$pageview", distinct_id="person7") + _create_event(team=self.team, event="user signed up", distinct_id="person7") + _create_event(team=self.team, event="pageview", distinct_id="person7", properties={"key": "val"}) + + person8 = _create_person(distinct_ids=["person8"], team_id=self.team.pk) + _create_event(team=self.team, event="blaah blaa", distinct_id="person8") + _create_event(team=self.team, event="user signed up", distinct_id="person8") + _create_event(team=self.team, event="user signed up", distinct_id="person8") + _create_event(team=self.team, event="sign up", distinct_id="person8", properties={"key": "val"}) + _create_event(team=self.team, event="$pageview", distinct_id="person8") + _create_event(team=self.team, event="pageview", distinct_id="person8", properties={"key": "val"}) + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[1]["name"], "sign up") + self.assertEqual(result[2]["name"], "$pageview") + self.assertEqual(result[3]["name"], "pageview") + self.assertEqual(result[0]["count"], 8) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), + [ + person1_stopped_after_signup.uuid, + person2_stopped_after_one_pageview.uuid, + person3_stopped_after_insight_view.uuid, + person4.uuid, + person5.uuid, + person6.uuid, + person7.uuid, + person8.uuid, + ], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 2), + [ + person3_stopped_after_insight_view.uuid, + person4.uuid, + person5.uuid, + person6.uuid, + person7.uuid, + person8.uuid, + ], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 3), + [person4.uuid, person5.uuid, person6.uuid, person7.uuid, person8.uuid,], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 4), [person8.uuid,], + ) + + def test_basic_strict_funnel_conversion_times(self): + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "events": [ + {"id": "user signed up", "order": 0}, + {"id": "$pageview", "order": 1}, + {"id": "insight viewed", "order": 2}, + ], + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 23:59:59", + } + ) + + funnel = ClickhouseFunnelStrict(filter, self.team) + + person1_stopped_after_signup = _create_person(distinct_ids=["stopped_after_signup1"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id="stopped_after_signup1", timestamp="2021-05-02 00:00:00" + ) + + person2_stopped_after_one_pageview = _create_person( + distinct_ids=["stopped_after_pageview1"], team_id=self.team.pk + ) + _create_event( + team=self.team, + event="user signed up", + distinct_id="stopped_after_pageview1", + timestamp="2021-05-02 00:00:00", + ) + _create_event( + team=self.team, event="$pageview", distinct_id="stopped_after_pageview1", timestamp="2021-05-02 01:00:00" + ) + + person3_stopped_after_insight_view = _create_person( + distinct_ids=["stopped_after_insightview"], team_id=self.team.pk + ) + _create_event( + team=self.team, + event="user signed up", + distinct_id="stopped_after_insightview", + timestamp="2021-05-02 00:00:00", + ) + _create_event( + team=self.team, event="$pageview", distinct_id="stopped_after_insightview", 
timestamp="2021-05-02 02:00:00" + ) + _create_event( + team=self.team, + event="insight viewed", + distinct_id="stopped_after_insightview", + timestamp="2021-05-02 04:00:00", + ) + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[1]["name"], "$pageview") + self.assertEqual(result[2]["name"], "insight viewed") + self.assertEqual(result[0]["count"], 3) + + self.assertEqual(result[1]["average_conversion_time"], 5400) + # 1 hour for Person 2, 2 hours for Person 3, average = 1.5 hours + + self.assertEqual(result[2]["average_conversion_time"], 7200) + # 2 hours for Person 3 + + self.assertCountEqual( + self._get_people_at_step(filter, 1), + [ + person1_stopped_after_signup.uuid, + person2_stopped_after_one_pageview.uuid, + person3_stopped_after_insight_view.uuid, + ], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 2), + [person2_stopped_after_one_pageview.uuid, person3_stopped_after_insight_view.uuid], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 3), [person3_stopped_after_insight_view.uuid], + ) diff --git a/ee/clickhouse/queries/funnels/test/test_funnel_strict_persons.py b/ee/clickhouse/queries/funnels/test/test_funnel_strict_persons.py new file mode 100644 index 0000000000000..389ba5086c53f --- /dev/null +++ b/ee/clickhouse/queries/funnels/test/test_funnel_strict_persons.py @@ -0,0 +1,116 @@ +from uuid import uuid4 + +from ee.clickhouse.models.event import create_event +from ee.clickhouse.queries.funnels.funnel_strict_persons import ClickhouseFunnelStrictPersons +from ee.clickhouse.util import ClickhouseTestMixin +from posthog.constants import INSIGHT_FUNNELS +from posthog.models.filters import Filter +from posthog.models.person import Person +from posthog.test.base import APIBaseTest + +FORMAT_TIME = "%Y-%m-%d 00:00:00" + + +def _create_person(**kwargs): + person = Person.objects.create(**kwargs) + return Person(id=person.uuid, uuid=person.uuid) + + +def _create_event(**kwargs): + kwargs.update({"event_uuid": uuid4()}) + create_event(**kwargs) + + +class TestFunnelStrictStepsPersons(ClickhouseTestMixin, APIBaseTest): + def _create_sample_data_multiple_dropoffs(self): + for i in range(5): + _create_person(distinct_ids=[f"user_{i}"], team=self.team) + _create_event(event="step one", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-01 00:00:00") + _create_event(event="step fake", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-02 00:00:00") + _create_event(event="step two", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-03 00:00:00") + _create_event(event="step three", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-05 00:00:00") + + for i in range(5, 15): + _create_person(distinct_ids=[f"user_{i}"], team=self.team) + _create_event(event="step one", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-01 00:00:00") + _create_event(event="step two", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-03 00:00:00") + + for i in range(15, 35): + _create_person(distinct_ids=[f"user_{i}"], team=self.team) + _create_event(event="step one", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-01 00:00:00") + + def test_first_step(self): + self._create_sample_data_multiple_dropoffs() + data = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_days": 7, + "funnel_step": 1, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", 
"order": 1}, + {"id": "step three", "order": 2}, + ], + } + filter = Filter(data=data) + results, _ = ClickhouseFunnelStrictPersons(filter, self.team).run() + self.assertEqual(35, len(results)) + + def test_second_step(self): + self._create_sample_data_multiple_dropoffs() + data = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_days": 7, + "funnel_step": 2, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + filter = Filter(data=data) + results, _ = ClickhouseFunnelStrictPersons(filter, self.team).run() + self.assertEqual(10, len(results)) + + def test_second_step_dropoff(self): + self._create_sample_data_multiple_dropoffs() + data = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_days": 7, + "funnel_step": -2, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + filter = Filter(data=data) + results, _ = ClickhouseFunnelStrictPersons(filter, self.team).run() + self.assertEqual(25, len(results)) + + def test_third_step(self): + self._create_sample_data_multiple_dropoffs() + data = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_days": 7, + "funnel_step": 3, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + filter = Filter(data=data) + results, _ = ClickhouseFunnelStrictPersons(filter, self.team).run() + self.assertEqual(0, len(results)) diff --git a/ee/clickhouse/queries/funnels/test/test_funnel_time_to_convert.py b/ee/clickhouse/queries/funnels/test/test_funnel_time_to_convert.py new file mode 100644 index 0000000000000..74dad02c356fb --- /dev/null +++ b/ee/clickhouse/queries/funnels/test/test_funnel_time_to_convert.py @@ -0,0 +1,362 @@ +import unittest +from uuid import uuid4 + +from ee.clickhouse.models.event import create_event +from ee.clickhouse.queries.funnels import ClickhouseFunnel, ClickhouseFunnelStrict, ClickhouseFunnelUnordered +from ee.clickhouse.queries.funnels.funnel_time_to_convert import ClickhouseFunnelTimeToConvert +from ee.clickhouse.util import ClickhouseTestMixin +from posthog.constants import INSIGHT_FUNNELS, TRENDS_LINEAR +from posthog.models.filters import Filter +from posthog.models.person import Person +from posthog.test.base import APIBaseTest + +FORMAT_TIME = "%Y-%m-%d %H:%M:%S" +FORMAT_TIME_DAY_END = "%Y-%m-%d 23:59:59" + + +def _create_person(**kwargs): + person = Person.objects.create(**kwargs) + return Person(id=person.uuid, uuid=person.uuid) + + +def _create_event(**kwargs): + kwargs.update({"event_uuid": uuid4()}) + create_event(**kwargs) + + +class TestFunnelTrends(ClickhouseTestMixin, APIBaseTest): + maxDiff = None + + def test_auto_bin_count_single_step(self): + _create_person(distinct_ids=["user a"], team=self.team) + _create_person(distinct_ids=["user b"], team=self.team) + _create_person(distinct_ids=["user c"], team=self.team) + + _create_event(event="step one", distinct_id="user a", team=self.team, timestamp="2021-06-08 18:00:00") + _create_event(event="step two", distinct_id="user a", team=self.team, timestamp="2021-06-08 19:00:00") + # Converted from 0 to 1 in 3600 s + _create_event(event="step three", distinct_id="user a", 
team=self.team, timestamp="2021-06-08 21:00:00") + + _create_event(event="step one", distinct_id="user b", team=self.team, timestamp="2021-06-09 13:00:00") + _create_event(event="step two", distinct_id="user b", team=self.team, timestamp="2021-06-09 13:37:00") + # Converted from 0 to 1 in 2200 s + + _create_event(event="step one", distinct_id="user c", team=self.team, timestamp="2021-06-11 07:00:00") + _create_event(event="step two", distinct_id="user c", team=self.team, timestamp="2021-06-12 06:00:00") + # Converted from 0 to 1 in 82_800 s + + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-06-07 00:00:00", + "date_to": "2021-06-13 23:59:59", + "funnel_from_step": 0, + "funnel_to_step": 1, + "funnel_window_days": 7, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + ) + + funnel_trends = ClickhouseFunnelTimeToConvert(filter, self.team, ClickhouseFunnel) + results = funnel_trends.run() + + # Autobinned using the minimum time to convert, maximum time to convert, and sample count + self.assertEqual( + results, + { + "bins": [ + (2220.0, 2), # Reached step 1 from step 0 in at least 2200 s but less than 29_080 s - users A and B + (29080.0, 0), # Analogous to above, just an interval (in this case 26_880 s) up - no users + (55940.0, 0), # Same as above + (82800.0, 1), # Reached step 1 from step 0 in at least 82_800 s but less than 109_680 s - user C + ], + "average_conversion_time": 29_540, + }, + ) + + @unittest.skip("Wait for bug to be resolved") + def test_auto_bin_count_single_step_duplicate_events(self): + # demonstrates existing CH bug. Current patch is to remove negative times from consideration + # Reference on what happens: https://github.com/ClickHouse/ClickHouse/issues/26580 + + _create_person(distinct_ids=["user a"], team=self.team) + _create_person(distinct_ids=["user b"], team=self.team) + _create_person(distinct_ids=["user c"], team=self.team) + + _create_event(event="step one", distinct_id="user a", team=self.team, timestamp="2021-06-08 18:00:00") + _create_event(event="step one", distinct_id="user a", team=self.team, timestamp="2021-06-08 19:00:00") + # Converted from 0 to 1 in 3600 s + _create_event(event="step one", distinct_id="user a", team=self.team, timestamp="2021-06-08 21:00:00") + + _create_event(event="step one", distinct_id="user b", team=self.team, timestamp="2021-06-09 13:00:00") + _create_event(event="step one", distinct_id="user b", team=self.team, timestamp="2021-06-09 13:37:00") + # Converted from 0 to 1 in 2200 s + + _create_event(event="step one", distinct_id="user c", team=self.team, timestamp="2021-06-11 07:00:00") + _create_event(event="step one", distinct_id="user c", team=self.team, timestamp="2021-06-12 06:00:00") + # Converted from 0 to 1 in 82_800 s + + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-06-07 00:00:00", + "date_to": "2021-06-13 23:59:59", + "funnel_from_step": 0, + "funnel_to_step": 1, + "funnel_window_days": 7, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step one", "order": 1}, + {"id": "step one", "order": 2}, + ], + } + ) + + funnel_trends = ClickhouseFunnelTimeToConvert(filter, self.team, ClickhouseFunnel) + results = funnel_trends.run() + + # Autobinned using the minimum time to convert, maximum time to convert, and sample count + self.assertEqual( + results, + { + "bins": [ + (2220.0, 2), # Reached step 1 from step 0 in at least 2200 s but less 
than 29_080 s - users A and B + (29080.0, 0), # Analogous to above, just an interval (in this case 26_880 s) up - no users + (55940.0, 0), # Same as above + (82800.0, 1), # Reached step 1 from step 0 in at least 82_800 s but less than 109_680 s - user C + ], + "average_conversion_time": 29_540, + }, + ) + + def test_custom_bin_count_single_step(self): + _create_person(distinct_ids=["user a"], team=self.team) + _create_person(distinct_ids=["user b"], team=self.team) + _create_person(distinct_ids=["user c"], team=self.team) + + _create_event(event="step one", distinct_id="user a", team=self.team, timestamp="2021-06-08 18:00:00") + _create_event(event="step two", distinct_id="user a", team=self.team, timestamp="2021-06-08 19:00:00") + # Converted from 0 to 1 in 3600 s + _create_event(event="step three", distinct_id="user a", team=self.team, timestamp="2021-06-08 21:00:00") + + _create_event(event="step one", distinct_id="user b", team=self.team, timestamp="2021-06-09 13:00:00") + _create_event(event="step two", distinct_id="user b", team=self.team, timestamp="2021-06-09 13:37:00") + # Converted from 0 to 1 in 2200 s + + _create_event(event="step one", distinct_id="user c", team=self.team, timestamp="2021-06-11 07:00:00") + _create_event(event="step two", distinct_id="user c", team=self.team, timestamp="2021-06-12 06:00:00") + # Converted from 0 to 1 in 82_800 s + + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-06-07 00:00:00", + "date_to": "2021-06-13 23:59:59", + "funnel_from_step": 0, + "funnel_to_step": 1, + "funnel_window_days": 7, + "bin_count": 7, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + ) + + funnel_trends = ClickhouseFunnelTimeToConvert(filter, self.team, ClickhouseFunnel) + results = funnel_trends.run() + + # 7 bins, autoscaled to work best with minimum time to convert and maximum time to convert at hand + self.assertEqual( + results, + { + "bins": [ + (2220.0, 2), # Reached step 1 from step 0 in at least 2200 s but less than 13_732 s - users A and B + (13732.0, 0), # Analogous to above, just an interval (in this case 13_732 s) up - no users + (25244.0, 0), # And so on + (36756.0, 0), + (48268.0, 0), + (59780.0, 0), + (71292.0, 1), # Reached step 1 from step 0 in at least 71_292 s but less than 82_804 s - user C + (82804.0, 0), + ], + "average_conversion_time": 29_540, + }, + ) + + def test_auto_bin_count_total(self): + _create_person(distinct_ids=["user a"], team=self.team) + _create_person(distinct_ids=["user b"], team=self.team) + _create_person(distinct_ids=["user c"], team=self.team) + + _create_event(event="step one", distinct_id="user a", team=self.team, timestamp="2021-06-08 18:00:00") + _create_event(event="step two", distinct_id="user a", team=self.team, timestamp="2021-06-08 19:00:00") + _create_event(event="step three", distinct_id="user a", team=self.team, timestamp="2021-06-08 21:00:00") + # Converted from 0 to 2 in 10_800 s + + _create_event(event="step one", distinct_id="user b", team=self.team, timestamp="2021-06-09 13:00:00") + _create_event(event="step two", distinct_id="user b", team=self.team, timestamp="2021-06-09 13:37:00") + + _create_event(event="step one", distinct_id="user c", team=self.team, timestamp="2021-06-11 07:00:00") + _create_event(event="step two", distinct_id="user c", team=self.team, timestamp="2021-06-12 06:00:00") + + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "interval": "day", + 
"date_from": "2021-06-07 00:00:00", + "date_to": "2021-06-13 23:59:59", + "funnel_window_days": 7, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + ) + + funnel_trends = ClickhouseFunnelTimeToConvert(filter, self.team, ClickhouseFunnel) + results = funnel_trends.run() + + self.assertEqual( + results, + { + "bins": [ + (10800.0, 1), # Reached step 2 from step 0 in at least 10_800 s but less than 10_860 s - user A + (10860.0, 0), # Analogous to above, just an interval (in this case 60 s) up - no users + (10920.0, 0), # And so on + (10980.0, 0), + ], + "average_conversion_time": 10_800, + }, + ) + + # Let's verify that behavior with steps unspecified is the same as when first and last steps specified + funnel_trends_steps_specified = ClickhouseFunnelTimeToConvert( + Filter(data={**filter._data, "funnel_from_step": 0, "funnel_to_step": 2,}), self.team, ClickhouseFunnel + ) + results_steps_specified = funnel_trends_steps_specified.run() + + self.assertEqual(results, results_steps_specified) + + def test_basic_unordered(self): + _create_person(distinct_ids=["user a"], team=self.team) + _create_person(distinct_ids=["user b"], team=self.team) + _create_person(distinct_ids=["user c"], team=self.team) + + _create_event(event="step three", distinct_id="user a", team=self.team, timestamp="2021-06-08 18:00:00") + _create_event(event="step one", distinct_id="user a", team=self.team, timestamp="2021-06-08 19:00:00") + _create_event(event="step two", distinct_id="user a", team=self.team, timestamp="2021-06-08 21:00:00") + # Converted from 0 to 1 in 7200 s + + _create_event(event="step one", distinct_id="user b", team=self.team, timestamp="2021-06-09 13:00:00") + _create_event(event="step two", distinct_id="user b", team=self.team, timestamp="2021-06-09 13:37:00") + # Converted from 0 to 1 in 2200 s + + _create_event(event="step two", distinct_id="user c", team=self.team, timestamp="2021-06-11 07:00:00") + _create_event(event="step one", distinct_id="user c", team=self.team, timestamp="2021-06-12 06:00:00") + # Converted from 0 to 1 in 82_800 s + + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "display": TRENDS_LINEAR, + "interval": "day", + "date_from": "2021-06-07 00:00:00", + "date_to": "2021-06-13 23:59:59", + "funnel_from_step": 0, + "funnel_to_step": 1, + "funnel_window_days": 7, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + ) + + funnel_trends = ClickhouseFunnelTimeToConvert(filter, self.team, ClickhouseFunnelUnordered) + results = funnel_trends.run() + + # Autobinned using the minimum time to convert, maximum time to convert, and sample count + self.assertEqual( + results, + { + "bins": [ + (2220.0, 2), # Reached step 1 from step 0 in at least 2200 s but less than 29_080 s - users A and B + (29080.0, 0), # Analogous to above, just an interval (in this case 26_880 s) up - no users + (55940.0, 0), # Same as above + (82800.0, 1), # Reached step 1 from step 0 in at least 82_800 s but less than 109_680 s - user C + ], + "average_conversion_time": 29540, + }, + ) + + def test_basic_strict(self): + _create_person(distinct_ids=["user a"], team=self.team) + _create_person(distinct_ids=["user b"], team=self.team) + _create_person(distinct_ids=["user c"], team=self.team) + _create_person(distinct_ids=["user d"], team=self.team) + + _create_event(event="step one", distinct_id="user a", team=self.team, timestamp="2021-06-08 18:00:00") + 
_create_event(event="step two", distinct_id="user a", team=self.team, timestamp="2021-06-08 19:00:00") + # Converted from 0 to 1 in 3600 s + _create_event(event="step three", distinct_id="user a", team=self.team, timestamp="2021-06-08 21:00:00") + + _create_event(event="step one", distinct_id="user b", team=self.team, timestamp="2021-06-09 13:00:00") + _create_event(event="step two", distinct_id="user b", team=self.team, timestamp="2021-06-09 13:37:00") + # Converted from 0 to 1 in 2200 s + _create_event(event="blah", distinct_id="user b", team=self.team, timestamp="2021-06-09 13:38:00") + _create_event(event="step three", distinct_id="user b", team=self.team, timestamp="2021-06-09 13:39:00") + + _create_event(event="step one", distinct_id="user c", team=self.team, timestamp="2021-06-11 07:00:00") + _create_event(event="step two", distinct_id="user c", team=self.team, timestamp="2021-06-12 06:00:00") + # Converted from 0 to 1 in 82_800 s + + _create_event(event="step one", distinct_id="user d", team=self.team, timestamp="2021-06-11 07:00:00") + _create_event(event="blah", distinct_id="user d", team=self.team, timestamp="2021-06-12 07:00:00") + # Blah cancels conversion + _create_event(event="step two", distinct_id="user d", team=self.team, timestamp="2021-06-12 09:00:00") + + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "display": TRENDS_LINEAR, + "interval": "day", + "date_from": "2021-06-07 00:00:00", + "date_to": "2021-06-13 23:59:59", + "funnel_from_step": 0, + "funnel_to_step": 1, + "funnel_window_days": 7, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + ) + + funnel_trends = ClickhouseFunnelTimeToConvert(filter, self.team, ClickhouseFunnelStrict) + results = funnel_trends.run() + + # Autobinned using the minimum time to convert, maximum time to convert, and sample count + self.assertEqual( + results, + { + "bins": [ + (2220.0, 2), # Reached step 1 from step 0 in at least 2200 s but less than 29_080 s - users A and B + (29080.0, 0), # Analogous to above, just an interval (in this case 26_880 s) up - no users + (55940.0, 0), # Same as above + (82800.0, 1), # Reached step 1 from step 0 in at least 82_800 s but less than 109_680 s - user C + ], + "average_conversion_time": 29540, + }, + ) diff --git a/ee/clickhouse/queries/funnels/test/test_funnel_trends.py b/ee/clickhouse/queries/funnels/test/test_funnel_trends.py new file mode 100644 index 0000000000000..eb3508d87d474 --- /dev/null +++ b/ee/clickhouse/queries/funnels/test/test_funnel_trends.py @@ -0,0 +1,1324 @@ +from datetime import date, datetime, timedelta +from uuid import uuid4 + +import pytz +from freezegun.api import freeze_time + +from ee.clickhouse.models.event import create_event +from ee.clickhouse.queries.funnels import ClickhouseFunnel, ClickhouseFunnelStrict, ClickhouseFunnelUnordered +from ee.clickhouse.queries.funnels.funnel_trends import ClickhouseFunnelTrends +from ee.clickhouse.queries.funnels.funnel_trends_persons import ClickhouseFunnelTrendsPersons +from ee.clickhouse.util import ClickhouseTestMixin +from posthog.constants import INSIGHT_FUNNELS, TRENDS_LINEAR +from posthog.models.cohort import Cohort +from posthog.models.filters import Filter +from posthog.models.person import Person +from posthog.test.base import APIBaseTest + +FORMAT_TIME = "%Y-%m-%d %H:%M:%S" +FORMAT_TIME_DAY_END = "%Y-%m-%d 23:59:59" + + +def _create_person(**kwargs): + person = Person.objects.create(**kwargs) + return Person(id=person.uuid, 
uuid=person.uuid) + + +def _create_event(**kwargs): + kwargs.update({"event_uuid": uuid4()}) + create_event(**kwargs) + + +class TestFunnelTrends(ClickhouseTestMixin, APIBaseTest): + maxDiff = None + + def _get_people_at_step(self, filter, entrance_period_start, drop_off, funnel_class=ClickhouseFunnel): + person_filter = filter.with_data({"entrance_period_start": entrance_period_start, "drop_off": drop_off}) + return ClickhouseFunnelTrendsPersons(person_filter, self.team, funnel_class).run() + + def _create_sample_data(self): + # five people, three steps + _create_person(distinct_ids=["user_one"], team=self.team) + _create_person(distinct_ids=["user_two"], team=self.team) + _create_person(distinct_ids=["user_three"], team=self.team) + _create_person(distinct_ids=["user_four"], team=self.team) + _create_person(distinct_ids=["user_five"], team=self.team) + _create_person(distinct_ids=["user_six"], team=self.team) + _create_person(distinct_ids=["user_seven"], team=self.team) + _create_person(distinct_ids=["user_eight"], team=self.team) + + # user_one, funnel steps: one, two three + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2021-05-01 00:00:00") + _create_event(event="step two", distinct_id="user_one", team=self.team, timestamp="2021-05-03 00:00:00") + _create_event(event="step three", distinct_id="user_one", team=self.team, timestamp="2021-05-05 00:00:00") + + # user_two, funnel steps: one, two + _create_event(event="step one", distinct_id="user_two", team=self.team, timestamp="2021-05-02 00:00:00") + _create_event(event="step two", distinct_id="user_two", team=self.team, timestamp="2021-05-04 00:00:00") + + # user_three, funnel steps: one + _create_event(event="step one", distinct_id="user_three", team=self.team, timestamp="2021-05-06 00:00:00") + + # user_four, funnel steps: none + _create_event(event="step none", distinct_id="user_four", team=self.team, timestamp="2021-05-06 00:00:00") + + # user_five, funnel steps: one, two, three in the same day + _create_event(event="step one", distinct_id="user_five", team=self.team, timestamp="2021-05-01 01:00:00") + _create_event(event="step two", distinct_id="user_five", team=self.team, timestamp="2021-05-01 02:00:00") + _create_event(event="step three", distinct_id="user_five", team=self.team, timestamp="2021-05-01 03:00:00") + + # user_six, funnel steps: one, two three + _create_event(event="step one", distinct_id="user_six", team=self.team, timestamp="2021-05-01 00:00:00") + _create_event(event="step two", distinct_id="user_six", team=self.team, timestamp="2021-05-03 00:00:00") + _create_event(event="step three", distinct_id="user_six", team=self.team, timestamp="2021-05-05 00:00:00") + + # user_seven, funnel steps: one, two + _create_event(event="step one", distinct_id="user_seven", team=self.team, timestamp="2021-05-02 00:00:00") + _create_event(event="step two", distinct_id="user_seven", team=self.team, timestamp="2021-05-04 00:00:00") + + def test_no_event_in_period(self): + _create_person(distinct_ids=["user a"], team=self.team) + + _create_event(event="step one", distinct_id="user a", team=self.team, timestamp="2021-06-06 21:00:00") + + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "display": TRENDS_LINEAR, + "interval": "day", + "date_from": "2021-06-07 00:00:00", + "date_to": "2021-06-13 23:59:59", + "funnel_window_days": 7, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + ) + + funnel_trends = 
ClickhouseFunnelTrends(filter, self.team, ClickhouseFunnel) + results = funnel_trends._exec_query() + formatted_results = funnel_trends._format_results(results) + + self.assertEqual(len(results), 7) + self.assertEqual(formatted_results[0]["days"][0], "2021-06-07") + + def test_only_one_user_reached_one_step(self): + _create_person(distinct_ids=["user a"], team=self.team) + + _create_event(event="step one", distinct_id="user a", team=self.team, timestamp="2021-06-07 19:00:00") + + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "display": TRENDS_LINEAR, + "interval": "day", + "date_from": "2021-06-07 00:00:00", + "date_to": "2021-06-13 23:59:59", + "funnel_window_days": 7, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + ) + funnel_trends = ClickhouseFunnelTrends(filter, self.team, ClickhouseFunnel) + results = funnel_trends._exec_query() + + self.assertEqual( + results, + [ + { + "reached_to_step_count": 0, + "is_period_final": True, + "conversion_rate": 0, + "reached_from_step_count": 1, + "timestamp": datetime(2021, 6, 7, 0, 0).replace(tzinfo=pytz.UTC), + }, + { + "reached_to_step_count": 0, + "is_period_final": True, + "conversion_rate": 0, + "reached_from_step_count": 0, + "timestamp": datetime(2021, 6, 8, 0, 0).replace(tzinfo=pytz.UTC), + }, + { + "reached_to_step_count": 0, + "is_period_final": True, + "conversion_rate": 0, + "reached_from_step_count": 0, + "timestamp": datetime(2021, 6, 9, 0, 0).replace(tzinfo=pytz.UTC), + }, + { + "reached_to_step_count": 0, + "is_period_final": True, + "conversion_rate": 0, + "reached_from_step_count": 0, + "timestamp": datetime(2021, 6, 10, 0, 0).replace(tzinfo=pytz.UTC), + }, + { + "reached_to_step_count": 0, + "is_period_final": True, + "conversion_rate": 0, + "reached_from_step_count": 0, + "timestamp": datetime(2021, 6, 11, 0, 0).replace(tzinfo=pytz.UTC), + }, + { + "reached_to_step_count": 0, + "is_period_final": True, + "conversion_rate": 0, + "reached_from_step_count": 0, + "timestamp": datetime(2021, 6, 12, 0, 0).replace(tzinfo=pytz.UTC), + }, + { + "reached_to_step_count": 0, + "is_period_final": True, + "conversion_rate": 0, + "reached_from_step_count": 0, + "timestamp": datetime(2021, 6, 13, 0, 0).replace(tzinfo=pytz.UTC), + }, + ], + ) + + # 1 user who dropped off starting 2021-06-07 + funnel_trends_persons_existent_dropped_off_results, _ = self._get_people_at_step( + filter, "2021-06-07 00:00:00", True + ) + + self.assertEqual( + len(funnel_trends_persons_existent_dropped_off_results), 1, + ) + self.assertEqual( + [person["distinct_ids"] for person in funnel_trends_persons_existent_dropped_off_results], [["user a"]], + ) + + # No users converted 2021-06-07 + funnel_trends_persons_nonexistent_converted_results, _ = self._get_people_at_step( + filter, "2021-06-07 00:00:00", False + ) + + self.assertEqual( + len(funnel_trends_persons_nonexistent_converted_results), 0, + ) + + # No users dropped off 2021-06-08 + funnel_trends_persons_nonexistent_converted_results, _ = self._get_people_at_step( + filter, "2021-06-08 00:00:00", True + ) + + self.assertEqual( + len(funnel_trends_persons_nonexistent_converted_results), 0, + ) + + # minute, hour, day, week, month + def test_hour_interval(self): + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "display": TRENDS_LINEAR, + "interval": "hour", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_days": 7, + "events": [ + {"id": "step one", "order": 0}, + 
{"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + ) + results = ClickhouseFunnelTrends(filter, self.team, ClickhouseFunnel)._exec_query() + self.assertEqual(len(results), 145) + + def test_day_interval(self): + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "display": TRENDS_LINEAR, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_days": 7, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + ) + _create_person(distinct_ids=["user_one"], team=self.team) + + # full run + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2021-05-01 00:00:00") + _create_event(event="step two", distinct_id="user_one", team=self.team, timestamp="2021-05-01 01:00:00") + _create_event(event="step three", distinct_id="user_one", team=self.team, timestamp="2021-05-01 02:00:00") + + results = ClickhouseFunnelTrends(filter, self.team, ClickhouseFunnel)._exec_query() + self.assertEqual(7, len(results)) + + persons, _ = self._get_people_at_step(filter, "2021-05-01 00:00:00", False) + + self.assertEqual( + [person["distinct_ids"] for person in persons], [["user_one"]], + ) + + def test_week_interval(self): + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "display": TRENDS_LINEAR, + "interval": "week", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_days": 7, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + ) + + _create_person(distinct_ids=["user_one"], team=self.team) + + # full run + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2021-05-01 00:00:00") + _create_event(event="step two", distinct_id="user_one", team=self.team, timestamp="2021-05-01 01:00:00") + _create_event(event="step three", distinct_id="user_one", team=self.team, timestamp="2021-05-01 02:00:00") + + results = ClickhouseFunnelTrends(filter, self.team, ClickhouseFunnel)._exec_query() + persons, _ = self._get_people_at_step(filter, "2021-04-25 00:00:00", False) + + self.assertEqual(2, len(results)) + self.assertEqual( + [person["distinct_ids"] for person in persons], [["user_one"]], + ) + + def test_month_interval(self): + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "display": TRENDS_LINEAR, + "interval": "month", + "date_from": "2020-01-01 00:00:00", + "date_to": "2020-07-01 00:00:00", + "funnel_window_days": 7, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + ) + _create_person(distinct_ids=["user_one"], team=self.team) + + # full run + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2020-05-01 00:00:00") + _create_event(event="step two", distinct_id="user_one", team=self.team, timestamp="2020-05-01 01:00:00") + _create_event(event="step three", distinct_id="user_one", team=self.team, timestamp="2020-05-01 02:00:00") + + results = ClickhouseFunnelTrends(filter, self.team, ClickhouseFunnel)._exec_query() + self.assertEqual( + results, + [ + { + "conversion_rate": 0.0, + "is_period_final": True, + "reached_from_step_count": 0, + "reached_to_step_count": 0, + "timestamp": date(2020, 1, 1), + }, + { + "conversion_rate": 0.0, + "is_period_final": True, + "reached_from_step_count": 0, + "reached_to_step_count": 0, + "timestamp": date(2020, 2, 1), + }, + { 
+ "conversion_rate": 0.0, + "is_period_final": True, + "reached_from_step_count": 0, + "reached_to_step_count": 0, + "timestamp": date(2020, 3, 1), + }, + { + "conversion_rate": 0.0, + "is_period_final": True, + "reached_from_step_count": 0, + "reached_to_step_count": 0, + "timestamp": date(2020, 4, 1), + }, + { + "conversion_rate": 100.0, + "is_period_final": True, + "reached_from_step_count": 1, + "reached_to_step_count": 1, + "timestamp": date(2020, 5, 1), + }, + { + "conversion_rate": 0.0, + "is_period_final": True, + "reached_from_step_count": 0, + "reached_to_step_count": 0, + "timestamp": date(2020, 6, 1), + }, + { + "conversion_rate": 0.0, + "is_period_final": True, + "reached_from_step_count": 0, + "reached_to_step_count": 0, + "timestamp": date(2020, 7, 1), + }, + ], + ) + + persons, _ = self._get_people_at_step(filter, "2020-05-01 00:00:00", False) + + self.assertEqual( + [person["distinct_ids"] for person in persons], [["user_one"]], + ) + + def test_all_date_range(self): + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "display": TRENDS_LINEAR, + "interval": "day", + "date_from": "all", + "funnel_window_days": 7, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + ) + _create_person(distinct_ids=["user_one"], team=self.team) + + # full run + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2021-05-01 00:00:00") + _create_event(event="step two", distinct_id="user_one", team=self.team, timestamp="2021-05-01 01:00:00") + _create_event(event="step three", distinct_id="user_one", team=self.team, timestamp="2021-05-01 02:00:00") + + with freeze_time("2021-05-20T13:01:01Z"): + results = ClickhouseFunnelTrends(filter, self.team, ClickhouseFunnel)._exec_query() + self.assertEqual(20, len(results)) + + persons, _ = self._get_people_at_step(filter, "2021-05-01 00:00:00", False) + + self.assertEqual( + [person["distinct_ids"] for person in persons], [["user_one"]], + ) + + def test_all_results_for_day_interval(self): + self._create_sample_data() + + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "display": TRENDS_LINEAR, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_days": 7, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + ) + results = ClickhouseFunnelTrends(filter, self.team, ClickhouseFunnel)._exec_query() + + saturday = results[0] # 5/1 + self.assertEqual(3, saturday["reached_to_step_count"]) + self.assertEqual(3, saturday["reached_from_step_count"]) + self.assertEqual(100, saturday["conversion_rate"]) + self.assertEqual(True, saturday["is_period_final"]) + + sunday = results[1] # 5/2 + self.assertEqual(0, sunday["reached_to_step_count"]) + self.assertEqual(2, sunday["reached_from_step_count"]) + self.assertEqual(0, sunday["conversion_rate"]) + self.assertEqual(True, sunday["is_period_final"]) + + monday = results[2] # 5/3 + self.assertEqual(0, monday["reached_to_step_count"]) + self.assertEqual(0, monday["reached_from_step_count"]) + self.assertEqual(0, monday["conversion_rate"]) + self.assertEqual(True, monday["is_period_final"]) + + tuesday = results[3] # 5/4 + self.assertEqual(0, tuesday["reached_to_step_count"]) + self.assertEqual(0, tuesday["reached_from_step_count"]) + self.assertEqual(0, tuesday["conversion_rate"]) + self.assertEqual(True, tuesday["is_period_final"]) + + wednesday = results[4] # 5/5 + 
self.assertEqual(0, wednesday["reached_to_step_count"]) + self.assertEqual(0, wednesday["reached_from_step_count"]) + self.assertEqual(0, wednesday["conversion_rate"]) + self.assertEqual(True, wednesday["is_period_final"]) + + thursday = results[5] # 5/6 + self.assertEqual(0, thursday["reached_to_step_count"]) + self.assertEqual(1, thursday["reached_from_step_count"]) + self.assertEqual(0, thursday["conversion_rate"]) + self.assertEqual(True, thursday["is_period_final"]) + + friday = results[6] # 5/7 + self.assertEqual(0, friday["reached_to_step_count"]) + self.assertEqual(0, friday["reached_from_step_count"]) + self.assertEqual(0, friday["conversion_rate"]) + self.assertEqual(True, friday["is_period_final"]) + + def test_window_size_one_day(self): + self._create_sample_data() + + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "display": TRENDS_LINEAR, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_days": 1, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + ) + results = ClickhouseFunnelTrends(filter, self.team, ClickhouseFunnel)._exec_query() + + saturday = results[0] # 5/1 + self.assertEqual(1, saturday["reached_to_step_count"]) + self.assertEqual(3, saturday["reached_from_step_count"]) + self.assertEqual(33.33, saturday["conversion_rate"]) + self.assertEqual(True, saturday["is_period_final"]) + + sunday = results[1] # 5/2 + self.assertEqual(0, sunday["reached_to_step_count"]) + self.assertEqual(2, sunday["reached_from_step_count"]) + self.assertEqual(0, sunday["conversion_rate"]) + self.assertEqual(True, sunday["is_period_final"]) + + monday = results[2] # 5/3 + self.assertEqual(0, monday["reached_to_step_count"]) + self.assertEqual(0, monday["reached_from_step_count"]) + self.assertEqual(0, monday["conversion_rate"]) + self.assertEqual(True, monday["is_period_final"]) + + tuesday = results[3] # 5/4 + self.assertEqual(0, tuesday["reached_to_step_count"]) + self.assertEqual(0, tuesday["reached_from_step_count"]) + self.assertEqual(0, tuesday["conversion_rate"]) + self.assertEqual(True, tuesday["is_period_final"]) + + wednesday = results[4] # 5/5 + self.assertEqual(0, wednesday["reached_to_step_count"]) + self.assertEqual(0, wednesday["reached_from_step_count"]) + self.assertEqual(0, wednesday["conversion_rate"]) + self.assertEqual(True, wednesday["is_period_final"]) + + thursday = results[5] # 5/6 + self.assertEqual(0, thursday["reached_to_step_count"]) + self.assertEqual(1, thursday["reached_from_step_count"]) + self.assertEqual(0, thursday["conversion_rate"]) + self.assertEqual(True, thursday["is_period_final"]) + + friday = results[6] # 5/7 + self.assertEqual(0, friday["reached_to_step_count"]) + self.assertEqual(0, friday["reached_from_step_count"]) + self.assertEqual(0, friday["conversion_rate"]) + self.assertEqual(True, friday["is_period_final"]) + + def test_period_not_final(self): + now = datetime.now() + + _create_person(distinct_ids=["user_eight"], team=self.team) + _create_event(event="step one", distinct_id="user_eight", team=self.team, timestamp=now.strftime(FORMAT_TIME)) + _create_event( + event="step two", + distinct_id="user_eight", + team=self.team, + timestamp=(now + timedelta(minutes=1)).strftime(FORMAT_TIME), + ) + _create_event( + event="step three", + distinct_id="user_eight", + team=self.team, + timestamp=(now + timedelta(minutes=2)).strftime(FORMAT_TIME), + ) + + filter = Filter( + data={ + "insight": 
INSIGHT_FUNNELS, + "display": TRENDS_LINEAR, + "interval": "day", + "date_from": (now - timedelta(1)).strftime(FORMAT_TIME), + "date_to": now.strftime(FORMAT_TIME_DAY_END), + "funnel_window_days": 1, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + ) + results = ClickhouseFunnelTrends(filter, self.team, ClickhouseFunnel)._exec_query() + + self.assertEqual(len(results), 2) + + day = results[0] # yesterday + self.assertEqual(day["reached_from_step_count"], 0) + self.assertEqual(day["reached_to_step_count"], 0) + self.assertEqual(day["conversion_rate"], 0) + self.assertEqual( + day["timestamp"].replace(tzinfo=pytz.UTC), + (datetime(now.year, now.month, now.day) - timedelta(1)).replace(tzinfo=pytz.UTC), + ) + self.assertEqual(day["is_period_final"], True) # this window can't be affected anymore + + day = results[1] # today + self.assertEqual(day["reached_from_step_count"], 1) + self.assertEqual(day["reached_to_step_count"], 1) + self.assertEqual(day["conversion_rate"], 100) + self.assertEqual( + day["timestamp"].replace(tzinfo=pytz.UTC), datetime(now.year, now.month, now.day).replace(tzinfo=pytz.UTC) + ) + self.assertEqual(day["is_period_final"], False) # events coming in now may stil affect this + + def test_two_runs_by_single_user_in_one_period(self): + _create_person(distinct_ids=["user_one"], team=self.team) + + # 1st full run + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2021-05-01 00:00:00") + _create_event(event="step two", distinct_id="user_one", team=self.team, timestamp="2021-05-01 01:00:00") + _create_event(event="step three", distinct_id="user_one", team=self.team, timestamp="2021-05-01 02:00:00") + + # 2nd full run + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2021-05-01 13:00:00") + _create_event(event="step two", distinct_id="user_one", team=self.team, timestamp="2021-05-01 14:00:00") + _create_event(event="step three", distinct_id="user_one", team=self.team, timestamp="2021-05-01 15:00:00") + + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "display": TRENDS_LINEAR, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-01 23:59:59", + "funnel_window_days": 1, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + ) + results = ClickhouseFunnelTrends(filter, self.team, ClickhouseFunnel)._exec_query() + + self.assertEqual(len(results), 1) + + day = results[0] # 2021-05-01 + self.assertEqual(day["reached_from_step_count"], 1) + self.assertEqual(day["reached_to_step_count"], 1) + self.assertEqual(day["conversion_rate"], 100) + self.assertEqual(day["is_period_final"], True) + + def test_steps_performed_in_period_but_in_reverse(self): + _create_person(distinct_ids=["user_one"], team=self.team) + + _create_event(event="step three", distinct_id="user_one", team=self.team, timestamp="2021-05-01 01:00:00") + _create_event(event="step two", distinct_id="user_one", team=self.team, timestamp="2021-05-01 02:00:00") + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2021-05-01 03:00:00") + + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "display": TRENDS_LINEAR, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-01 23:59:59", + "funnel_window_days": 1, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step 
three", "order": 2}, + ], + } + ) + results = ClickhouseFunnelTrends(filter, self.team, ClickhouseFunnel)._exec_query() + + self.assertEqual(len(results), 1) + + day_1 = results[0] # 2021-05-01 + self.assertEqual(day_1["reached_from_step_count"], 1) + self.assertEqual(day_1["reached_to_step_count"], 0) + self.assertEqual(day_1["conversion_rate"], 0) + self.assertEqual(day_1["is_period_final"], True) + + def test_one_person_in_multiple_periods_and_windows(self): + _create_person(distinct_ids=["user_one"], team=self.team) + _create_person(distinct_ids=["user_two"], team=self.team) + + # 1st user's 1st complete run + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2021-05-01 01:00:00") + _create_event(event="step two", distinct_id="user_one", team=self.team, timestamp="2021-05-01 02:00:00") + _create_event(event="step three", distinct_id="user_one", team=self.team, timestamp="2021-05-01 03:00:00") + + # 1st user's incomplete run + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2021-05-03 01:00:00") + _create_event(event="step two", distinct_id="user_one", team=self.team, timestamp="2021-05-03 02:00:00") + + # 2nd user's incomplete run + _create_event(event="step one", distinct_id="user_two", team=self.team, timestamp="2021-05-04 18:00:00") + + # 1st user's 2nd complete run + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2021-05-04 11:00:00") + _create_event(event="step two", distinct_id="user_one", team=self.team, timestamp="2021-05-04 12:00:00") + _create_event(event="step three", distinct_id="user_one", team=self.team, timestamp="2021-05-04 13:00:00") + + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "display": TRENDS_LINEAR, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-04 23:59:59", + "funnel_window_days": 1, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + ) + results = ClickhouseFunnelTrends(filter, self.team, ClickhouseFunnel)._exec_query() + + self.assertEqual(len(results), 4) + + day_1 = results[0] # 2021-05-01 + self.assertEqual(day_1["reached_from_step_count"], 1) + self.assertEqual(day_1["reached_to_step_count"], 1) + self.assertEqual(day_1["conversion_rate"], 100) + self.assertEqual(day_1["is_period_final"], True) + + day_2 = results[1] # 2021-05-02 + self.assertEqual(day_2["reached_from_step_count"], 0) + self.assertEqual(day_2["reached_to_step_count"], 0) + self.assertEqual(day_2["conversion_rate"], 0) + self.assertEqual(day_2["is_period_final"], True) + + day_3 = results[2] # 2021-05-03 + self.assertEqual(day_3["reached_from_step_count"], 1) + self.assertEqual(day_3["reached_to_step_count"], 0) + self.assertEqual(day_3["conversion_rate"], 0) + self.assertEqual(day_3["is_period_final"], True) + + day_4 = results[3] # 2021-05-04 + self.assertEqual(day_4["reached_from_step_count"], 2) + self.assertEqual(day_4["reached_to_step_count"], 1) + self.assertEqual(day_4["conversion_rate"], 50) + self.assertEqual(day_4["is_period_final"], True) + + # 1 user who dropped off starting # 2021-05-04 + funnel_trends_persons_existent_dropped_off_results, _ = self._get_people_at_step( + filter, "2021-05-04 00:00:00", True + ) + + self.assertEqual( + len(funnel_trends_persons_existent_dropped_off_results), 1, + ) + self.assertEqual( + [person["distinct_ids"] for person in funnel_trends_persons_existent_dropped_off_results], [["user_two"]], + ) + + # 1 user who 
converted starting # 2021-05-04 + funnel_trends_persons_existent_dropped_off_results, _ = self._get_people_at_step( + filter, "2021-05-04 00:00:00", False + ) + + self.assertEqual( + len(funnel_trends_persons_existent_dropped_off_results), 1, + ) + self.assertEqual( + [person["distinct_ids"] for person in funnel_trends_persons_existent_dropped_off_results], [["user_one"]], + ) + + def test_from_second_step(self): + _create_person(distinct_ids=["user_one"], team=self.team) + _create_person(distinct_ids=["user_two"], team=self.team) + _create_person(distinct_ids=["user_three"], team=self.team) + _create_person(distinct_ids=["user_four"], team=self.team) + + # 1st user's complete run - should fall into the 2021-05-01 bucket even though counting only from 2nd step + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2021-05-01 01:00:00") + _create_event(event="step two", distinct_id="user_one", team=self.team, timestamp="2021-05-02 02:00:00") + _create_event(event="step three", distinct_id="user_one", team=self.team, timestamp="2021-05-02 03:00:00") + + # 2nd user's incomplete run - should not count at all since not reaching 2nd step + _create_event(event="step one", distinct_id="user_two", team=self.team, timestamp="2021-05-01 01:00:00") + + # 3rd user's incomplete run - should not count at all since reaching 2nd step BUT not the 1st one + _create_event(event="step two", distinct_id="user_three", team=self.team, timestamp="2021-05-02 02:00:00") + _create_event(event="step three", distinct_id="user_three", team=self.team, timestamp="2021-05-02 03:00:00") + + # 4th user's incomplete run - should fall into the 2021-05-02 bucket as entered but not completed + _create_event(event="step one", distinct_id="user_four", team=self.team, timestamp="2021-05-02 01:00:00") + _create_event(event="step two", distinct_id="user_four", team=self.team, timestamp="2021-05-02 02:00:00") + + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "display": TRENDS_LINEAR, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-02 23:59:59", + "funnel_window_days": 3, + "funnel_from_step": 1, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + ) + results = ClickhouseFunnelTrends(filter, self.team, ClickhouseFunnel)._exec_query() + + self.assertEqual(len(results), 2) + + day_1 = results[0] # 2021-05-01 + self.assertEqual(day_1["reached_from_step_count"], 1) + self.assertEqual(day_1["reached_to_step_count"], 1) + self.assertEqual(day_1["conversion_rate"], 100) + self.assertEqual(day_1["is_period_final"], True) + + day_2 = results[1] # 2021-05-02 + self.assertEqual(day_2["reached_from_step_count"], 1) + self.assertEqual(day_2["reached_to_step_count"], 0) + self.assertEqual(day_2["conversion_rate"], 0) + self.assertEqual(day_2["is_period_final"], True) + + def test_to_second_step(self): + _create_person(distinct_ids=["user_one"], team=self.team) + _create_person(distinct_ids=["user_two"], team=self.team) + _create_person(distinct_ids=["user_three"], team=self.team) + _create_person(distinct_ids=["user_four"], team=self.team) + + # 1st user's complete run - should fall into the 2021-05-01 bucket + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2021-05-01 01:00:00") + _create_event(event="step two", distinct_id="user_one", team=self.team, timestamp="2021-05-02 02:00:00") + _create_event(event="step three", distinct_id="user_one", team=self.team, 
timestamp="2021-05-02 03:00:00") + + # 2nd user's incomplete run - should count as incomplete + _create_event(event="step one", distinct_id="user_two", team=self.team, timestamp="2021-05-01 01:00:00") + + # 3rd user's incomplete run - should not count at all since reaching 2nd step BUT not the 1st one + _create_event(event="step two", distinct_id="user_three", team=self.team, timestamp="2021-05-02 02:00:00") + _create_event(event="step three", distinct_id="user_three", team=self.team, timestamp="2021-05-02 03:00:00") + + # 4th user's incomplete run - should fall into the 2021-05-02 bucket as entered and completed + _create_event(event="step one", distinct_id="user_four", team=self.team, timestamp="2021-05-02 01:00:00") + _create_event(event="step two", distinct_id="user_four", team=self.team, timestamp="2021-05-02 02:00:00") + + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "display": TRENDS_LINEAR, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-02 23:59:59", + "funnel_window_days": 3, + "funnel_to_step": 1, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + ) + results = ClickhouseFunnelTrends(filter, self.team, ClickhouseFunnel)._exec_query() + + self.assertEqual(len(results), 2) + + day_1 = results[0] # 2021-05-01 + self.assertEqual(day_1["reached_from_step_count"], 2) + self.assertEqual(day_1["reached_to_step_count"], 1) + self.assertEqual(day_1["conversion_rate"], 50) + self.assertEqual(day_1["is_period_final"], True) + + day_2 = results[1] # 2021-05-02 + self.assertEqual(day_2["reached_from_step_count"], 1) + self.assertEqual(day_2["reached_to_step_count"], 1) + self.assertEqual(day_2["conversion_rate"], 100) + self.assertEqual(day_2["is_period_final"], True) + + def test_one_person_in_multiple_periods_and_windows_in_unordered_funnel(self): + _create_person(distinct_ids=["user_one"], team=self.team) + _create_person(distinct_ids=["user_two"], team=self.team) + + # 1st user's 1st complete run + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2021-05-01 01:00:00") + _create_event(event="step three", distinct_id="user_one", team=self.team, timestamp="2021-05-01 02:00:00") + _create_event(event="step two", distinct_id="user_one", team=self.team, timestamp="2021-05-01 03:00:00") + + # 1st user's incomplete run + _create_event(event="step two", distinct_id="user_one", team=self.team, timestamp="2021-05-03 01:00:00") + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2021-05-03 02:00:00") + + # 2nd user's incomplete run + _create_event(event="step one", distinct_id="user_two", team=self.team, timestamp="2021-05-04 18:00:00") + + # 1st user's 2nd complete run + _create_event(event="step three", distinct_id="user_one", team=self.team, timestamp="2021-05-04 11:00:00") + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2021-05-04 12:00:00") + _create_event(event="step two", distinct_id="user_one", team=self.team, timestamp="2021-05-04 13:00:00") + + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "display": TRENDS_LINEAR, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-04 23:59:59", + "funnel_window_days": 1, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + ) + results = ClickhouseFunnelTrends(filter, self.team, 
ClickhouseFunnelUnordered)._exec_query() + + self.assertEqual(len(results), 4) + + day_1 = results[0] # 2021-05-01 + self.assertEqual(day_1["reached_from_step_count"], 1) + self.assertEqual(day_1["reached_to_step_count"], 1) + self.assertEqual(day_1["conversion_rate"], 100) + self.assertEqual(day_1["is_period_final"], True) + + day_2 = results[1] # 2021-05-02 + self.assertEqual(day_2["reached_from_step_count"], 0) + self.assertEqual(day_2["reached_to_step_count"], 0) + self.assertEqual(day_2["conversion_rate"], 0) + self.assertEqual(day_2["is_period_final"], True) + + day_3 = results[2] # 2021-05-03 + self.assertEqual(day_3["reached_from_step_count"], 1) + self.assertEqual(day_3["reached_to_step_count"], 0) + self.assertEqual(day_3["conversion_rate"], 0) + self.assertEqual(day_3["is_period_final"], True) + + day_4 = results[3] # 2021-05-04 + self.assertEqual(day_4["reached_from_step_count"], 2) + self.assertEqual(day_4["reached_to_step_count"], 1) + self.assertEqual(day_4["conversion_rate"], 50) + self.assertEqual(day_4["is_period_final"], True) + + # 1 user who dropped off starting # 2021-05-04 + funnel_trends_persons_existent_dropped_off_results, _ = self._get_people_at_step( + filter, "2021-05-04 00:00:00", True, ClickhouseFunnelUnordered + ) + + self.assertEqual( + len(funnel_trends_persons_existent_dropped_off_results), 1, + ) + self.assertEqual( + [person["distinct_ids"] for person in funnel_trends_persons_existent_dropped_off_results], [["user_two"]], + ) + + # 1 user who converted starting # 2021-05-04 + funnel_trends_persons_existent_dropped_off_results, _ = self._get_people_at_step( + filter, "2021-05-04 00:00:00", False, ClickhouseFunnelUnordered + ) + + self.assertEqual( + len(funnel_trends_persons_existent_dropped_off_results), 1, + ) + self.assertEqual( + [person["distinct_ids"] for person in funnel_trends_persons_existent_dropped_off_results], [["user_one"]], + ) + + def test_one_person_in_multiple_periods_and_windows_in_strict_funnel(self): + _create_person(distinct_ids=["user_one"], team=self.team) + _create_person(distinct_ids=["user_two"], team=self.team) + + # 1st user's 1st complete run + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2021-05-01 01:00:00") + _create_event(event="step two", distinct_id="user_one", team=self.team, timestamp="2021-05-01 02:00:00") + _create_event(event="step three", distinct_id="user_one", team=self.team, timestamp="2021-05-01 03:00:00") + + # 1st user's incomplete run + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2021-05-03 01:00:00") + _create_event(event="step two", distinct_id="user_one", team=self.team, timestamp="2021-05-03 02:00:00") + # broken because strict + _create_event(event="blah", distinct_id="user_one", team=self.team, timestamp="2021-05-03 02:30:00") + _create_event(event="step three", distinct_id="user_one", team=self.team, timestamp="2021-05-03 03:00:00") + + # 2nd user's incomplete run + _create_event(event="step one", distinct_id="user_two", team=self.team, timestamp="2021-05-04 18:00:00") + # broken because strict + _create_event(event="blah", distinct_id="user_two", team=self.team, timestamp="2021-05-04 18:20:00") + _create_event(event="step two", distinct_id="user_two", team=self.team, timestamp="2021-05-04 19:00:00") + + # 1st user's 2nd complete run + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2021-05-04 11:00:00") + _create_event(event="step two", distinct_id="user_one", team=self.team, 
timestamp="2021-05-04 12:00:00") + _create_event(event="step three", distinct_id="user_one", team=self.team, timestamp="2021-05-04 13:00:00") + + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "display": TRENDS_LINEAR, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-04 23:59:59", + "funnel_window_days": 1, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + ) + results = ClickhouseFunnelTrends(filter, self.team, ClickhouseFunnelStrict)._exec_query() + + self.assertEqual(len(results), 4) + + day_1 = results[0] # 2021-05-01 + self.assertEqual(day_1["reached_from_step_count"], 1) + self.assertEqual(day_1["reached_to_step_count"], 1) + self.assertEqual(day_1["conversion_rate"], 100) + self.assertEqual(day_1["is_period_final"], True) + + day_2 = results[1] # 2021-05-02 + self.assertEqual(day_2["reached_from_step_count"], 0) + self.assertEqual(day_2["reached_to_step_count"], 0) + self.assertEqual(day_2["conversion_rate"], 0) + self.assertEqual(day_2["is_period_final"], True) + + day_3 = results[2] # 2021-05-03 + self.assertEqual(day_3["reached_from_step_count"], 1) + self.assertEqual(day_3["reached_to_step_count"], 0) + self.assertEqual(day_3["conversion_rate"], 0) + self.assertEqual(day_3["is_period_final"], True) + + day_4 = results[3] # 2021-05-04 + self.assertEqual(day_4["reached_from_step_count"], 2) + self.assertEqual(day_4["reached_to_step_count"], 1) + self.assertEqual(day_4["conversion_rate"], 50) + self.assertEqual(day_4["is_period_final"], True) + + def test_funnel_step_breakdown_event(self): + _create_person(distinct_ids=["user_one"], team=self.team) + _create_event( + event="step one", + distinct_id="user_one", + team=self.team, + timestamp="2021-05-01 00:00:00", + properties={"$browser": "Chrome"}, + ) + _create_event( + event="step two", + distinct_id="user_one", + team=self.team, + timestamp="2021-05-03 00:00:00", + properties={"$browser": "Chrome"}, + ) + _create_event( + event="step three", + distinct_id="user_one", + team=self.team, + timestamp="2021-05-05 00:00:00", + properties={"$browser": "Chrome"}, + ) + + _create_person(distinct_ids=["user_two"], team=self.team) + _create_event( + event="step one", + distinct_id="user_two", + team=self.team, + timestamp="2021-05-02 00:00:00", + properties={"$browser": "Chrome"}, + ) + _create_event( + event="step two", + distinct_id="user_two", + team=self.team, + timestamp="2021-05-03 00:00:00", + properties={"$browser": "Chrome"}, + ) + _create_event( + event="step three", + distinct_id="user_two", + team=self.team, + timestamp="2021-05-05 00:00:00", + properties={"$browser": "Chrome"}, + ) + + _create_person(distinct_ids=["user_three"], team=self.team) + _create_event( + event="step one", + distinct_id="user_three", + team=self.team, + timestamp="2021-05-03 00:00:00", + properties={"$browser": "Safari"}, + ) + _create_event( + event="step two", + distinct_id="user_three", + team=self.team, + timestamp="2021-05-04 00:00:00", + properties={"$browser": "Safari"}, + ) + _create_event( + event="step three", + distinct_id="user_three", + team=self.team, + timestamp="2021-05-05 00:00:00", + properties={"$browser": "Safari"}, + ) + + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "display": TRENDS_LINEAR, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-13 23:59:59", + "funnel_window_days": 7, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + 
{"id": "step three", "order": 2}, + ], + "breakdown_type": "event", + "breakdown": "$browser", + } + ) + funnel_trends = ClickhouseFunnelTrends(filter, self.team, ClickhouseFunnel) + result = funnel_trends.run() + + self.assertEqual(len(result), 2) + + for res in result: + if res["breakdown_value"] == "Chrome": + self.assertEqual(res["data"], [100.0, 100.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) + elif res["breakdown_value"] == "Safari": + self.assertEqual(res["data"], [0.0, 0.0, 100.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) + else: + self.fail(msg="Invalid breakdown value") + + def test_funnel_step_breakdown_person(self): + _create_person( + distinct_ids=["user_one"], team=self.team, properties={"$browser": "Chrome"}, + ) + _create_event( + event="step one", + distinct_id="user_one", + team=self.team, + timestamp="2021-05-01 00:00:00", + properties={"$browser": "Chrome"}, + ) + _create_event( + event="step two", + distinct_id="user_one", + team=self.team, + timestamp="2021-05-03 00:00:00", + properties={"$browser": "Chrome"}, + ) + _create_event( + event="step three", + distinct_id="user_one", + team=self.team, + timestamp="2021-05-05 00:00:00", + properties={"$browser": "Chrome"}, + ) + + _create_person( + distinct_ids=["user_two"], team=self.team, properties={"$browser": "Chrome"}, + ) + _create_event( + event="step one", + distinct_id="user_two", + team=self.team, + timestamp="2021-05-02 00:00:00", + properties={"$browser": "Chrome"}, + ) + _create_event( + event="step two", + distinct_id="user_two", + team=self.team, + timestamp="2021-05-03 00:00:00", + properties={"$browser": "Chrome"}, + ) + _create_event( + event="step three", + distinct_id="user_two", + team=self.team, + timestamp="2021-05-05 00:00:00", + properties={"$browser": "Chrome"}, + ) + + _create_person( + distinct_ids=["user_three"], team=self.team, properties={"$browser": "Safari"}, + ) + _create_event( + event="step one", + distinct_id="user_three", + team=self.team, + timestamp="2021-05-03 00:00:00", + properties={"$browser": "Safari"}, + ) + _create_event( + event="step two", + distinct_id="user_three", + team=self.team, + timestamp="2021-05-04 00:00:00", + properties={"$browser": "Safari"}, + ) + _create_event( + event="step three", + distinct_id="user_three", + team=self.team, + timestamp="2021-05-05 00:00:00", + properties={"$browser": "Safari"}, + ) + + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "display": TRENDS_LINEAR, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-13 23:59:59", + "funnel_window_days": 7, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + "breakdown_type": "person", + "breakdown": "$browser", + } + ) + funnel_trends = ClickhouseFunnelTrends(filter, self.team, ClickhouseFunnel) + result = funnel_trends.run() + + self.assertEqual(len(result), 2) + + for res in result: + if res["breakdown_value"] == "Chrome": + self.assertEqual(res["data"], [100.0, 100.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) + elif res["breakdown_value"] == "Safari": + self.assertEqual(res["data"], [0.0, 0.0, 100.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) + else: + self.fail(msg="Invalid breakdown value") + + def test_funnel_trend_cohort_breakdown(self): + _create_person( + distinct_ids=["user_one"], team=self.team, properties={"key": "value"}, + ) + _create_event( + event="step one", + distinct_id="user_one", + team=self.team, + timestamp="2021-05-01 
00:00:00", + properties={"$browser": "Chrome"}, + ) + _create_event( + event="step two", + distinct_id="user_one", + team=self.team, + timestamp="2021-05-03 00:00:00", + properties={"$browser": "Chrome"}, + ) + _create_event( + event="step three", + distinct_id="user_one", + team=self.team, + timestamp="2021-05-05 00:00:00", + properties={"$browser": "Chrome"}, + ) + + _create_person( + distinct_ids=["user_two"], team=self.team, properties={"key": "value"}, + ) + _create_event( + event="step one", + distinct_id="user_two", + team=self.team, + timestamp="2021-05-02 00:00:00", + properties={"$browser": "Chrome"}, + ) + _create_event( + event="step two", + distinct_id="user_two", + team=self.team, + timestamp="2021-05-03 00:00:00", + properties={"$browser": "Chrome"}, + ) + _create_event( + event="step three", + distinct_id="user_two", + team=self.team, + timestamp="2021-05-05 00:00:00", + properties={"$browser": "Chrome"}, + ) + + _create_person( + distinct_ids=["user_three"], team=self.team, properties={"$browser": "Safari"}, + ) + _create_event( + event="step one", + distinct_id="user_three", + team=self.team, + timestamp="2021-05-03 00:00:00", + properties={"$browser": "Safari"}, + ) + _create_event( + event="step two", + distinct_id="user_three", + team=self.team, + timestamp="2021-05-04 00:00:00", + properties={"$browser": "Safari"}, + ) + _create_event( + event="step three", + distinct_id="user_three", + team=self.team, + timestamp="2021-05-05 00:00:00", + properties={"$browser": "Safari"}, + ) + + cohort = Cohort.objects.create(team=self.team, name="test_cohort", groups=[{"properties": {"key": "value"}}]) + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "display": TRENDS_LINEAR, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-13 23:59:59", + "funnel_window_days": 7, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + "breakdown_type": "cohort", + "breakdown": [cohort.pk], + } + ) + funnel_trends = ClickhouseFunnelTrends(filter, self.team, ClickhouseFunnel) + + result = funnel_trends.run() + self.assertEqual(len(result), 1) + self.assertEqual(result[0]["data"], [100.0, 100.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) diff --git a/ee/clickhouse/queries/funnels/test/test_funnel_unordered.py b/ee/clickhouse/queries/funnels/test/test_funnel_unordered.py new file mode 100644 index 0000000000000..3d0ee2646dd5b --- /dev/null +++ b/ee/clickhouse/queries/funnels/test/test_funnel_unordered.py @@ -0,0 +1,745 @@ +from uuid import uuid4 + +from rest_framework.exceptions import ValidationError + +from ee.clickhouse.models.event import create_event +from ee.clickhouse.models.group import create_group +from ee.clickhouse.queries.funnels.funnel_unordered import ClickhouseFunnelUnordered +from ee.clickhouse.queries.funnels.funnel_unordered_persons import ClickhouseFunnelUnorderedPersons +from ee.clickhouse.queries.funnels.test.breakdown_cases import funnel_breakdown_test_factory +from ee.clickhouse.queries.funnels.test.conversion_time_cases import funnel_conversion_time_test_factory +from ee.clickhouse.util import ClickhouseTestMixin, snapshot_clickhouse_queries +from posthog.constants import INSIGHT_FUNNELS +from posthog.models.action import Action +from posthog.models.action_step import ActionStep +from posthog.models.filters import Filter +from posthog.models.group_type_mapping import GroupTypeMapping +from posthog.models.person import Person +from posthog.test.base 
import APIBaseTest + +FORMAT_TIME = "%Y-%m-%d 00:00:00" + + +def _create_person(**kwargs): + person = Person.objects.create(**kwargs) + return Person(id=person.uuid, uuid=person.uuid) + + +def _create_event(**kwargs): + kwargs.update({"event_uuid": uuid4()}) + create_event(**kwargs) + + +def _create_action(**kwargs): + team = kwargs.pop("team") + name = kwargs.pop("name") + properties = kwargs.pop("properties", {}) + action = Action.objects.create(team=team, name=name) + ActionStep.objects.create(action=action, event=name, properties=properties) + return action + + +class TestFunnelUnorderedStepsBreakdown(ClickhouseTestMixin, funnel_breakdown_test_factory(ClickhouseFunnelUnordered, ClickhouseFunnelUnorderedPersons, _create_event, _create_action, _create_person)): # type: ignore + maxDiff = None + + def test_funnel_step_breakdown_event_single_person_events_with_multiple_properties(self): + # overridden from factory + + filters = { + "events": [{"id": "sign up", "order": 0}, {"id": "play movie", "order": 1}], + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-08", + "funnel_window_days": 7, + "breakdown_type": "event", + "breakdown": "$browser", + } + + filter = Filter(data=filters) + funnel = ClickhouseFunnelUnordered(filter, self.team) + + # event + person1 = _create_person(distinct_ids=["person1"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person1", + properties={"key": "val", "$browser": "Chrome"}, + timestamp="2020-01-01T12:00:00Z", + ) + _create_event( + team=self.team, + event="sign up", + distinct_id="person1", + properties={"key": "val", "$browser": "Safari"}, + timestamp="2020-01-02T13:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person1", + properties={"key": "val", "$browser": "Safari"}, + timestamp="2020-01-02T14:00:00Z", + ) + + result = funnel.run() + self.assertEqual( + result[0], + [ + { + "action_id": "sign up", + "name": "sign up", + "custom_name": None, + "order": 0, + "people": [], + "count": 1, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Chrome", + "breakdown_value": "Chrome", + }, + { + "action_id": "play movie", + "name": "play movie", + "custom_name": None, + "order": 1, + "people": [], + "count": 0, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Chrome", + "breakdown_value": "Chrome", + }, + ], + ) + self.assertCountEqual(self._get_people_at_step(filter, 1, "Chrome"), [person1.uuid]) + self.assertCountEqual(self._get_people_at_step(filter, 2, "Chrome"), []) + + self.assertEqual( + result[1], + [ + { + "action_id": "sign up", + "name": "sign up", + "custom_name": None, + "order": 0, + "people": [], + "count": 1, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": "Safari", + "breakdown_value": "Safari", + }, + { + "action_id": "play movie", + "name": "play movie", + "custom_name": None, + "order": 1, + "people": [], + "count": 1, + "type": "events", + "average_conversion_time": 3600, + "median_conversion_time": 3600, + "breakdown": "Safari", + "breakdown_value": "Safari", + }, + ], + ) + self.assertCountEqual(self._get_people_at_step(filter, 1, "Safari"), [person1.uuid]) + self.assertCountEqual(self._get_people_at_step(filter, 2, "Safari"), [person1.uuid]) + + +class TestFunnelUnorderedStepsConversionTime(ClickhouseTestMixin, 
funnel_conversion_time_test_factory(ClickhouseFunnelUnordered, ClickhouseFunnelUnorderedPersons, _create_event, _create_person)): # type: ignore + maxDiff = None + pass + + +class TestFunnelUnorderedSteps(ClickhouseTestMixin, APIBaseTest): + def _get_people_at_step(self, filter, funnel_step): + person_filter = filter.with_data({"funnel_step": funnel_step}) + result = ClickhouseFunnelUnorderedPersons(person_filter, self.team)._exec_query() + return [row[0] for row in result] + + def test_basic_unordered_funnel(self): + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "events": [ + {"id": "user signed up", "order": 0}, + {"id": "$pageview", "order": 1}, + {"id": "insight viewed", "order": 2}, + ], + } + ) + + funnel = ClickhouseFunnelUnordered(filter, self.team) + + person1_stopped_after_signup = _create_person(distinct_ids=["stopped_after_signup1"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_signup1") + + person2_stopped_after_one_pageview = _create_person( + distinct_ids=["stopped_after_pageview1"], team_id=self.team.pk + ) + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview1") + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_pageview1") + + person3_stopped_after_insight_view = _create_person( + distinct_ids=["stopped_after_insightview"], team_id=self.team.pk + ) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_insightview") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_insightview") + _create_event(team=self.team, event="blaah blaa", distinct_id="stopped_after_insightview") + _create_event(team=self.team, event="insight viewed", distinct_id="stopped_after_insightview") + + person4_stopped_after_insight_view_reverse_order = _create_person( + distinct_ids=["stopped_after_insightview2"], team_id=self.team.pk + ) + _create_event(team=self.team, event="insight viewed", distinct_id="stopped_after_insightview2") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_insightview2") + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_insightview2") + + person5_stopped_after_insight_view_random = _create_person( + distinct_ids=["stopped_after_insightview3"], team_id=self.team.pk + ) + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_insightview3") + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_insightview3") + _create_event(team=self.team, event="blaah blaa", distinct_id="stopped_after_insightview3") + _create_event(team=self.team, event="insight viewed", distinct_id="stopped_after_insightview3") + + person6_did_only_insight_view = _create_person( + distinct_ids=["stopped_after_insightview4"], team_id=self.team.pk + ) + _create_event(team=self.team, event="blaah blaa", distinct_id="stopped_after_insightview4") + _create_event(team=self.team, event="insight viewed", distinct_id="stopped_after_insightview4") + + person7_did_only_pageview = _create_person(distinct_ids=["stopped_after_insightview5"], team_id=self.team.pk) + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_insightview5") + _create_event(team=self.team, event="blaah blaa", distinct_id="stopped_after_insightview5") + + person8_didnot_signup = _create_person(distinct_ids=["stopped_after_insightview6"], team_id=self.team.pk) + _create_event(team=self.team, event="insight viewed", 
distinct_id="stopped_after_insightview6") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_insightview6") + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 8) + self.assertEqual(result[1]["name"], "$pageview") + self.assertEqual(result[1]["count"], 5) + self.assertEqual(result[2]["name"], "insight viewed") + self.assertEqual(result[2]["count"], 3) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), + [ + person1_stopped_after_signup.uuid, + person2_stopped_after_one_pageview.uuid, + person3_stopped_after_insight_view.uuid, + person4_stopped_after_insight_view_reverse_order.uuid, + person5_stopped_after_insight_view_random.uuid, + person6_did_only_insight_view.uuid, + person7_did_only_pageview.uuid, + person8_didnot_signup.uuid, + ], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 2), + [ + person2_stopped_after_one_pageview.uuid, + person3_stopped_after_insight_view.uuid, + person4_stopped_after_insight_view_reverse_order.uuid, + person5_stopped_after_insight_view_random.uuid, + person8_didnot_signup.uuid, + ], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, -2), + [person1_stopped_after_signup.uuid, person6_did_only_insight_view.uuid, person7_did_only_pageview.uuid,], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 3), + [ + person3_stopped_after_insight_view.uuid, + person4_stopped_after_insight_view_reverse_order.uuid, + person5_stopped_after_insight_view_random.uuid, + ], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, -3), + [person2_stopped_after_one_pageview.uuid, person8_didnot_signup.uuid,], + ) + + def test_big_multi_step_unordered_funnel(self): + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "events": [ + {"id": "user signed up", "order": 0}, + {"id": "$pageview", "order": 1}, + {"id": "insight viewed", "order": 2}, + {"id": "crying", "order": 3}, + ], + } + ) + + funnel = ClickhouseFunnelUnordered(filter, self.team) + + person1_stopped_after_signup = _create_person(distinct_ids=["stopped_after_signup1"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_signup1") + + person2_stopped_after_one_pageview = _create_person( + distinct_ids=["stopped_after_pageview1"], team_id=self.team.pk + ) + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_pageview1") + _create_event(team=self.team, event="crying", distinct_id="stopped_after_pageview1") + + person3_stopped_after_insight_view = _create_person( + distinct_ids=["stopped_after_insightview"], team_id=self.team.pk + ) + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_insightview") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_insightview") + _create_event(team=self.team, event="blaah blaa", distinct_id="stopped_after_insightview") + _create_event(team=self.team, event="insight viewed", distinct_id="stopped_after_insightview") + + person4_stopped_after_insight_view_reverse_order = _create_person( + distinct_ids=["stopped_after_insightview2"], team_id=self.team.pk + ) + _create_event(team=self.team, event="insight viewed", distinct_id="stopped_after_insightview2") + _create_event(team=self.team, event="crying", distinct_id="stopped_after_insightview2") + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_insightview2") + + person5_stopped_after_insight_view_random = 
_create_person( + distinct_ids=["stopped_after_insightview3"], team_id=self.team.pk + ) + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_insightview3") + _create_event(team=self.team, event="user signed up", distinct_id="stopped_after_insightview3") + _create_event(team=self.team, event="crying", distinct_id="stopped_after_insightview3") + _create_event(team=self.team, event="insight viewed", distinct_id="stopped_after_insightview3") + + person6_did_only_insight_view = _create_person( + distinct_ids=["stopped_after_insightview4"], team_id=self.team.pk + ) + _create_event(team=self.team, event="blaah blaa", distinct_id="stopped_after_insightview4") + _create_event(team=self.team, event="insight viewed", distinct_id="stopped_after_insightview4") + + person7_did_only_pageview = _create_person(distinct_ids=["stopped_after_insightview5"], team_id=self.team.pk) + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_insightview5") + _create_event(team=self.team, event="blaah blaa", distinct_id="stopped_after_insightview5") + + person8_didnot_signup = _create_person(distinct_ids=["stopped_after_insightview6"], team_id=self.team.pk) + _create_event(team=self.team, event="insight viewed", distinct_id="stopped_after_insightview6") + _create_event(team=self.team, event="$pageview", distinct_id="stopped_after_insightview6") + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 8) + self.assertEqual(result[1]["name"], "$pageview") + self.assertEqual(result[1]["count"], 5) + self.assertEqual(result[2]["name"], "insight viewed") + self.assertEqual(result[2]["count"], 3) + self.assertEqual(result[3]["name"], "crying") + self.assertEqual(result[3]["count"], 1) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), + [ + person1_stopped_after_signup.uuid, + person2_stopped_after_one_pageview.uuid, + person3_stopped_after_insight_view.uuid, + person4_stopped_after_insight_view_reverse_order.uuid, + person5_stopped_after_insight_view_random.uuid, + person6_did_only_insight_view.uuid, + person7_did_only_pageview.uuid, + person8_didnot_signup.uuid, + ], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 2), + [ + person2_stopped_after_one_pageview.uuid, + person3_stopped_after_insight_view.uuid, + person4_stopped_after_insight_view_reverse_order.uuid, + person5_stopped_after_insight_view_random.uuid, + person8_didnot_signup.uuid, + ], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 3), + [ + person3_stopped_after_insight_view.uuid, + person4_stopped_after_insight_view_reverse_order.uuid, + person5_stopped_after_insight_view_random.uuid, + ], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 4), [person5_stopped_after_insight_view_random.uuid,], + ) + + def test_basic_unordered_funnel_conversion_times(self): + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "events": [ + {"id": "user signed up", "order": 0}, + {"id": "$pageview", "order": 1}, + {"id": "insight viewed", "order": 2}, + ], + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 23:59:59", + "funnel_window_days": "1", + } + ) + + funnel = ClickhouseFunnelUnordered(filter, self.team) + + person1_stopped_after_signup = _create_person(distinct_ids=["stopped_after_signup1"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id="stopped_after_signup1", timestamp="2021-05-02 00:00:00" + ) + + 
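# Person 1 only signs up, so they are counted in the first step but never convert and contribute no conversion times +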
person2_stopped_after_one_pageview = _create_person( + distinct_ids=["stopped_after_pageview1"], team_id=self.team.pk + ) + _create_event( + team=self.team, event="$pageview", distinct_id="stopped_after_pageview1", timestamp="2021-05-02 00:00:00" + ) + _create_event( + team=self.team, + event="user signed up", + distinct_id="stopped_after_pageview1", + timestamp="2021-05-02 01:00:00", + ) + + person3_stopped_after_insight_view = _create_person( + distinct_ids=["stopped_after_insightview"], team_id=self.team.pk + ) + _create_event( + team=self.team, + event="insight viewed", + distinct_id="stopped_after_insightview", + timestamp="2021-05-02 00:00:00", + ) + _create_event( + team=self.team, + event="user signed up", + distinct_id="stopped_after_insightview", + timestamp="2021-05-02 02:00:00", + ) + _create_event( + team=self.team, event="$pageview", distinct_id="stopped_after_insightview", timestamp="2021-05-02 04:00:00" + ) + + _create_event( + team=self.team, event="$pageview", distinct_id="stopped_after_insightview", timestamp="2021-05-03 00:00:00" + ) + _create_event( + team=self.team, + event="insight viewed", + distinct_id="stopped_after_insightview", + timestamp="2021-05-03 03:00:00", + ) + _create_event( + team=self.team, + event="user signed up", + distinct_id="stopped_after_insightview", + timestamp="2021-05-03 06:00:00", + ) + # Person 3 completes the funnel 2 times: + # First time: 2 hours + 2 hours = total 4 hours. + # Second time: 3 hours + 3 hours = total 6 hours. + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[1]["name"], "$pageview") + self.assertEqual(result[2]["name"], "insight viewed") + self.assertEqual(result[0]["count"], 3) + + self.assertEqual(result[1]["average_conversion_time"], 6300) + # 1 hour for Person 2, (2+3)/2 hours for Person 3, total = 3.5 hours, average = 3.5/2 = 1.75 hours + + self.assertEqual(result[2]["average_conversion_time"], 9000) + # (2+3)/2 hours for Person 3 = 2.5 hours + + self.assertCountEqual( + self._get_people_at_step(filter, 1), + [ + person1_stopped_after_signup.uuid, + person2_stopped_after_one_pageview.uuid, + person3_stopped_after_insight_view.uuid, + ], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 2), + [person2_stopped_after_one_pageview.uuid, person3_stopped_after_insight_view.uuid], + ) + + self.assertCountEqual( + self._get_people_at_step(filter, 3), [person3_stopped_after_insight_view.uuid], + ) + + def test_single_event_unordered_funnel(self): + filter = Filter( + data={ + "insight": INSIGHT_FUNNELS, + "events": [{"id": "user signed up", "order": 0},], + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 23:59:59", + } + ) + + funnel = ClickhouseFunnelUnordered(filter, self.team) + + person1_stopped_after_signup = _create_person(distinct_ids=["stopped_after_signup1"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id="stopped_after_signup1", timestamp="2021-05-02 00:00:00" + ) + + person2_stopped_after_one_pageview = _create_person( + distinct_ids=["stopped_after_pageview1"], team_id=self.team.pk + ) + _create_event( + team=self.team, event="$pageview", distinct_id="stopped_after_pageview1", timestamp="2021-05-02 00:00:00" + ) + _create_event( + team=self.team, + event="user signed up", + distinct_id="stopped_after_pageview1", + timestamp="2021-05-02 01:00:00", + ) + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 2) + + def 
test_funnel_exclusions_invalid_params(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "paid", "type": "events", "order": 1}, + {"id": "blah", "type": "events", "order": 2}, + ], + "insight": INSIGHT_FUNNELS, + "funnel_window_days": 14, + "exclusions": [{"id": "x", "type": "events", "funnel_from_step": 1, "funnel_to_step": 1},], + } + filter = Filter(data=filters) + self.assertRaises(ValidationError, lambda: ClickhouseFunnelUnordered(filter, self.team).run()) + + # partial windows not allowed for unordered + filter = filter.with_data( + {"exclusions": [{"id": "x", "type": "events", "funnel_from_step": 0, "funnel_to_step": 1}]} + ) + self.assertRaises(ValidationError, lambda: ClickhouseFunnelUnordered(filter, self.team).run()) + + def test_funnel_exclusions_full_window(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "funnel_window_days": 14, + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-14 00:00:00", + "exclusions": [{"id": "x", "type": "events", "funnel_from_step": 0, "funnel_to_step": 1},], + } + filter = Filter(data=filters) + funnel = ClickhouseFunnelUnordered(filter, self.team) + + # event 1 + person1 = _create_person(distinct_ids=["person1"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="person1", timestamp="2021-05-01 01:00:00") + _create_event(team=self.team, event="paid", distinct_id="person1", timestamp="2021-05-01 02:00:00") + + # event 2 + person2 = _create_person(distinct_ids=["person2"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="person2", timestamp="2021-05-01 03:00:00") + _create_event(team=self.team, event="x", distinct_id="person2", timestamp="2021-05-01 03:30:00") + _create_event(team=self.team, event="paid", distinct_id="person2", timestamp="2021-05-01 04:00:00") + + # event 3 + person3 = _create_person(distinct_ids=["person3"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="person3", timestamp="2021-05-01 05:00:00") + _create_event(team=self.team, event="paid", distinct_id="person3", timestamp="2021-05-01 06:00:00") + + result = funnel.run() + + self.assertEqual(len(result), 2) + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 3) + self.assertEqual(result[1]["name"], "paid") + self.assertEqual(result[1]["count"], 2) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), [person1.uuid, person2.uuid, person3.uuid], + ) + self.assertCountEqual( + self._get_people_at_step(filter, 2), [person1.uuid, person3.uuid], + ) + + def test_advanced_funnel_multiple_exclusions_between_steps(self): + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "$pageview", "type": "events", "order": 1}, + {"id": "insight viewed", "type": "events", "order": 2}, + {"id": "invite teammate", "type": "events", "order": 3}, + {"id": "pageview2", "type": "events", "order": 4}, + ], + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-14 00:00:00", + "insight": INSIGHT_FUNNELS, + "exclusions": [ + {"id": "x", "type": "events", "funnel_from_step": 0, "funnel_to_step": 4}, + {"id": "y", "type": "events", "funnel_from_step": 0, "funnel_to_step": 4}, + ], + } + + person1 = _create_person(distinct_ids=["person1"], team_id=self.team.pk) + _create_event(team=self.team, event="user 
signed up", distinct_id="person1", timestamp="2021-05-01 01:00:00") + _create_event(team=self.team, event="x", distinct_id="person1", timestamp="2021-05-01 02:00:00") + _create_event(team=self.team, event="$pageview", distinct_id="person1", timestamp="2021-05-01 03:00:00") + _create_event(team=self.team, event="insight viewed", distinct_id="person1", timestamp="2021-05-01 04:00:00") + _create_event(team=self.team, event="y", distinct_id="person1", timestamp="2021-05-01 04:30:00") + _create_event(team=self.team, event="invite teammate", distinct_id="person1", timestamp="2021-05-01 05:00:00") + _create_event(team=self.team, event="pageview2", distinct_id="person1", timestamp="2021-05-01 06:00:00") + + person2 = _create_person(distinct_ids=["person2"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="person2", timestamp="2021-05-01 01:00:00") + _create_event(team=self.team, event="y", distinct_id="person2", timestamp="2021-05-01 01:30:00") + _create_event(team=self.team, event="$pageview", distinct_id="person2", timestamp="2021-05-01 02:00:00") + _create_event(team=self.team, event="insight viewed", distinct_id="person2", timestamp="2021-05-01 04:00:00") + _create_event(team=self.team, event="y", distinct_id="person2", timestamp="2021-05-01 04:30:00") + _create_event(team=self.team, event="invite teammate", distinct_id="person2", timestamp="2021-05-01 05:00:00") + _create_event(team=self.team, event="x", distinct_id="person2", timestamp="2021-05-01 05:30:00") + _create_event(team=self.team, event="pageview2", distinct_id="person2", timestamp="2021-05-01 06:00:00") + + person3 = _create_person(distinct_ids=["person3"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="person3", timestamp="2021-05-01 01:00:00") + _create_event(team=self.team, event="x", distinct_id="person3", timestamp="2021-05-01 01:30:00") + _create_event(team=self.team, event="$pageview", distinct_id="person3", timestamp="2021-05-01 02:00:00") + _create_event(team=self.team, event="insight viewed", distinct_id="person3", timestamp="2021-05-01 04:00:00") + _create_event(team=self.team, event="invite teammate", distinct_id="person3", timestamp="2021-05-01 05:00:00") + _create_event(team=self.team, event="x", distinct_id="person3", timestamp="2021-05-01 05:30:00") + _create_event(team=self.team, event="pageview2", distinct_id="person3", timestamp="2021-05-01 06:00:00") + + person4 = _create_person(distinct_ids=["person4"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="person4", timestamp="2021-05-01 01:00:00") + _create_event(team=self.team, event="$pageview", distinct_id="person4", timestamp="2021-05-01 02:00:00") + _create_event(team=self.team, event="insight viewed", distinct_id="person4", timestamp="2021-05-01 04:00:00") + _create_event(team=self.team, event="invite teammate", distinct_id="person4", timestamp="2021-05-01 05:00:00") + _create_event(team=self.team, event="pageview2", distinct_id="person4", timestamp="2021-05-01 06:00:00") + + person5 = _create_person(distinct_ids=["person5"], team_id=self.team.pk) + _create_event(team=self.team, event="user signed up", distinct_id="person5", timestamp="2021-05-01 01:00:00") + _create_event(team=self.team, event="x", distinct_id="person5", timestamp="2021-05-01 01:30:00") + _create_event(team=self.team, event="$pageview", distinct_id="person5", timestamp="2021-05-01 02:00:00") + _create_event(team=self.team, event="x", distinct_id="person5", 
timestamp="2021-05-01 02:30:00") + _create_event(team=self.team, event="insight viewed", distinct_id="person5", timestamp="2021-05-01 04:00:00") + _create_event(team=self.team, event="y", distinct_id="person5", timestamp="2021-05-01 04:30:00") + _create_event(team=self.team, event="invite teammate", distinct_id="person5", timestamp="2021-05-01 05:00:00") + _create_event(team=self.team, event="x", distinct_id="person5", timestamp="2021-05-01 05:30:00") + _create_event(team=self.team, event="pageview2", distinct_id="person5", timestamp="2021-05-01 06:00:00") + + filter = Filter(data=filters) + funnel = ClickhouseFunnelUnordered(filter, self.team) + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 5) + self.assertEqual(result[1]["count"], 2) + self.assertEqual(result[2]["count"], 1) + self.assertEqual(result[3]["count"], 1) + self.assertEqual(result[4]["count"], 1) + + self.assertCountEqual( + self._get_people_at_step(filter, 1), [person1.uuid, person2.uuid, person3.uuid, person4.uuid, person5.uuid], + ) + self.assertCountEqual( + self._get_people_at_step(filter, 2), [person1.uuid, person4.uuid], + ) + self.assertCountEqual( + self._get_people_at_step(filter, 3), [person4.uuid], + ) + self.assertCountEqual( + self._get_people_at_step(filter, 4), [person4.uuid], + ) + self.assertCountEqual( + self._get_people_at_step(filter, 5), [person4.uuid], + ) + + @snapshot_clickhouse_queries + def test_unordered_funnel_with_groups(self): + GroupTypeMapping.objects.create(team=self.team, group_type="organization", group_type_index=0) + GroupTypeMapping.objects.create(team=self.team, group_type="company", group_type_index=1) + + create_group(team_id=self.team.pk, group_type_index=0, group_key="org:5", properties={"industry": "finance"}) + create_group(team_id=self.team.pk, group_type_index=0, group_key="org:6", properties={"industry": "technology"}) + + create_group(team_id=self.team.pk, group_type_index=1, group_key="company:1", properties={}) + create_group(team_id=self.team.pk, group_type_index=1, group_key="company:2", properties={}) + + filters = { + "events": [ + {"id": "user signed up", "type": "events", "order": 0}, + {"id": "paid", "type": "events", "order": 1}, + ], + "insight": INSIGHT_FUNNELS, + "aggregation_group_type_index": 0, + "date_from": "2020-01-01", + "date_to": "2020-01-14", + } + + filter = Filter(data=filters, team=self.team) + funnel = ClickhouseFunnelUnordered(filter, self.team) + + _create_person(distinct_ids=["user_1"], team_id=self.team.pk) + _create_event( + team=self.team, + event="user signed up", + distinct_id="user_1", + timestamp="2020-01-03T14:00:00Z", + properties={"$group_0": "org:5"}, + ) + + # different person, same group, so should count as step two in funnel + _create_person(distinct_ids=["user_2"], team_id=self.team.pk) + _create_event( + team=self.team, + event="paid", + distinct_id="user_2", + timestamp="2020-01-02T14:00:00Z", + properties={"$group_0": "org:5"}, + ) + + # same person, different group, so should count as different step 1 in funnel + _create_event( + team=self.team, + event="user signed up", + distinct_id="user_1", + timestamp="2020-01-10T14:00:00Z", + properties={"$group_0": "org:6"}, + ) + + result = funnel.run() + + self.assertEqual(result[0]["name"], "user signed up") + self.assertEqual(result[0]["count"], 2) + + self.assertEqual(result[1]["name"], "paid") + self.assertEqual(result[1]["count"], 1) diff --git 
a/ee/clickhouse/queries/funnels/test/test_funnel_unordered_persons.py b/ee/clickhouse/queries/funnels/test/test_funnel_unordered_persons.py new file mode 100644 index 0000000000000..a40738366fef4 --- /dev/null +++ b/ee/clickhouse/queries/funnels/test/test_funnel_unordered_persons.py @@ -0,0 +1,139 @@ +from uuid import uuid4 + +import pytest + +from ee.clickhouse.models.event import create_event +from ee.clickhouse.queries.funnels.funnel_unordered_persons import ClickhouseFunnelUnorderedPersons +from ee.clickhouse.util import ClickhouseTestMixin +from posthog.constants import INSIGHT_FUNNELS +from posthog.models.filters import Filter +from posthog.models.person import Person +from posthog.test.base import APIBaseTest + +FORMAT_TIME = "%Y-%m-%d 00:00:00" + + +def _create_person(**kwargs): + person = Person.objects.create(**kwargs) + return Person(id=person.uuid, uuid=person.uuid) + + +def _create_event(**kwargs): + kwargs.update({"event_uuid": uuid4()}) + create_event(**kwargs) + + +class TestFunnelUnorderedStepsPersons(ClickhouseTestMixin, APIBaseTest): + def _create_sample_data_multiple_dropoffs(self): + for i in range(5): + _create_person(distinct_ids=[f"user_{i}"], team=self.team) + _create_event(event="step one", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-01 00:00:00") + _create_event(event="step three", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-03 00:00:00") + _create_event(event="step two", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-05 00:00:00") + + for i in range(5, 15): + _create_person(distinct_ids=[f"user_{i}"], team=self.team) + _create_event(event="step two", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-01 00:00:00") + _create_event(event="step one", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-03 00:00:00") + + for i in range(15, 35): + _create_person(distinct_ids=[f"user_{i}"], team=self.team) + _create_event(event="step one", distinct_id=f"user_{i}", team=self.team, timestamp="2021-05-01 00:00:00") + + def test_invalid_steps(self): + data = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_days": 7, + "funnel_step": "blah", + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + filter = Filter(data=data) + with self.assertRaises(ValueError): + ClickhouseFunnelUnorderedPersons(filter, self.team).run() + + filter = filter.with_data({"funnel_step": -1}) + with pytest.raises(ValueError): + _, _ = ClickhouseFunnelUnorderedPersons(filter, self.team).run() + + def test_first_step(self): + self._create_sample_data_multiple_dropoffs() + data = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_days": 7, + "funnel_step": 1, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + filter = Filter(data=data) + results, _ = ClickhouseFunnelUnorderedPersons(filter, self.team).run() + self.assertEqual(35, len(results)) + + def test_last_step(self): + self._create_sample_data_multiple_dropoffs() + data = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_days": 7, + "funnel_step": 3, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", 
"order": 2}, + ], + } + filter = Filter(data=data) + results, _ = ClickhouseFunnelUnorderedPersons(filter, self.team).run() + self.assertEqual(5, len(results)) + + def test_second_step_dropoff(self): + self._create_sample_data_multiple_dropoffs() + data = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_days": 7, + "funnel_step": -2, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + filter = Filter(data=data) + results, _ = ClickhouseFunnelUnorderedPersons(filter, self.team).run() + self.assertEqual(20, len(results)) + + def test_last_step_dropoff(self): + self._create_sample_data_multiple_dropoffs() + data = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_days": 7, + "funnel_step": -3, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + filter = Filter(data=data) + results, _ = ClickhouseFunnelUnorderedPersons(filter, self.team).run() + self.assertEqual(10, len(results)) diff --git a/ee/clickhouse/queries/groups_join_query.py b/ee/clickhouse/queries/groups_join_query.py new file mode 100644 index 0000000000000..1e2325fa01276 --- /dev/null +++ b/ee/clickhouse/queries/groups_join_query.py @@ -0,0 +1,54 @@ +from typing import Dict, List, Optional, Set, Tuple, Union + +from ee.clickhouse.materialized_columns.columns import ColumnName +from ee.clickhouse.models.property import extract_tables_and_properties, prop_filter_json_extract +from ee.clickhouse.queries.column_optimizer import ColumnOptimizer +from posthog.models import Filter +from posthog.models.entity import Entity +from posthog.models.filters.path_filter import PathFilter +from posthog.models.filters.retention_filter import RetentionFilter +from posthog.models.property import Property + + +class GroupsJoinQuery: + """ + Query class responsible for joining with `groups` clickhouse table based on filters + """ + + _filter: Union[Filter, PathFilter, RetentionFilter] + _team_id: int + _column_optimizer: ColumnOptimizer + + def __init__( + self, + filter: Union[Filter, PathFilter, RetentionFilter], + team_id: int, + column_optimizer: Optional[ColumnOptimizer] = None, + ) -> None: + self._filter = filter + self._team_id = team_id + self._column_optimizer = column_optimizer or ColumnOptimizer(self._filter, self._team_id) + + def get_join_query(self) -> Tuple[str, Dict]: + join_queries, params = [], {} + + for group_type_index in self._column_optimizer.group_types_to_query: + var = f"group_index_{group_type_index}" + join_queries.append( + f""" + INNER JOIN ( + SELECT + group_key, + argMax(group_properties, _timestamp) AS group_properties_{group_type_index} + FROM groups + WHERE team_id = %(team_id)s AND group_type_index = %({var})s + GROUP BY group_key + ) groups_{group_type_index} + ON $group_{group_type_index} == groups_{group_type_index}.group_key + """ + ) + + params["team_id"] = self._team_id + params[var] = group_type_index + + return "\n".join(join_queries), params diff --git a/ee/clickhouse/queries/paths/__init__.py b/ee/clickhouse/queries/paths/__init__.py new file mode 100644 index 0000000000000..37195ad3e7747 --- /dev/null +++ b/ee/clickhouse/queries/paths/__init__.py @@ -0,0 +1,2 @@ +from .paths import ClickhousePaths +from .paths_persons import ClickhousePathsPersons diff --git 
a/ee/clickhouse/queries/paths/path_event_query.py b/ee/clickhouse/queries/paths/path_event_query.py new file mode 100644 index 0000000000000..5cafc956c4e69 --- /dev/null +++ b/ee/clickhouse/queries/paths/path_event_query.py @@ -0,0 +1,195 @@ +from typing import Any, Dict, List, Tuple + +from ee.clickhouse.models.property import get_property_string_expr +from ee.clickhouse.queries.event_query import ClickhouseEventQuery +from posthog.constants import ( + FUNNEL_PATH_AFTER_STEP, + FUNNEL_PATH_BEFORE_STEP, + FUNNEL_PATH_BETWEEN_STEPS, + PAGEVIEW_EVENT, + SCREEN_EVENT, +) +from posthog.models.filters.path_filter import PathFilter +from posthog.models.team import Team + + +class PathEventQuery(ClickhouseEventQuery): + FUNNEL_PERSONS_ALIAS = "funnel_persons" + _filter: PathFilter + + def get_query(self) -> Tuple[str, Dict[str, Any]]: + + funnel_paths_timestamp = "" + funnel_paths_join = "" + funnel_paths_filter = "" + + if self._filter.funnel_paths == FUNNEL_PATH_AFTER_STEP or self._filter.funnel_paths == FUNNEL_PATH_BEFORE_STEP: + # used when looking for paths up to a dropoff point, to account for events happening between the latest event and the point at which the person is deemed dropped off + funnel_window = ( + f"+ INTERVAL {self._filter.funnel_window_interval} {self._filter.funnel_window_interval_unit_ch()}" + ) + operator = ">=" if self._filter.funnel_paths == FUNNEL_PATH_AFTER_STEP else "<=" + + funnel_paths_timestamp = f"{self.FUNNEL_PERSONS_ALIAS}.timestamp AS target_timestamp" + funnel_paths_join = f"JOIN {self.FUNNEL_PERSONS_ALIAS} ON {self.FUNNEL_PERSONS_ALIAS}.person_id = {self.DISTINCT_ID_TABLE_ALIAS}.person_id" + funnel_paths_filter = f"AND {self.EVENT_TABLE_ALIAS}.timestamp {operator} target_timestamp {funnel_window if self._filter.funnel_paths == FUNNEL_PATH_BEFORE_STEP and self._filter.funnel_step and self._filter.funnel_step < 0 else ''}" + elif self._filter.funnel_paths == FUNNEL_PATH_BETWEEN_STEPS: + funnel_paths_timestamp = f"{self.FUNNEL_PERSONS_ALIAS}.min_timestamp as min_timestamp, {self.FUNNEL_PERSONS_ALIAS}.max_timestamp as max_timestamp" + funnel_paths_join = f"JOIN {self.FUNNEL_PERSONS_ALIAS} ON {self.FUNNEL_PERSONS_ALIAS}.person_id = {self.DISTINCT_ID_TABLE_ALIAS}.person_id" + funnel_paths_filter = f"AND {self.EVENT_TABLE_ALIAS}.timestamp >= min_timestamp AND {self.EVENT_TABLE_ALIAS}.timestamp <= max_timestamp" + + # We don't use ColumnOptimizer to decide what to query because the Paths query doesn't surface any filter properties + _fields = [ + f"{self.EVENT_TABLE_ALIAS}.timestamp AS timestamp", + f"{self.DISTINCT_ID_TABLE_ALIAS}.person_id as person_id" if self._should_join_distinct_ids else "", + funnel_paths_timestamp, + ] + + event_conditional = ( + f"if({self.EVENT_TABLE_ALIAS}.event = '{SCREEN_EVENT}', {self._get_screen_name_parsing()}, " + if self._should_query_screen() + else "if(0, '', " + ) + event_conditional += ( + f"if({self.EVENT_TABLE_ALIAS}.event = '{PAGEVIEW_EVENT}', {self._get_current_url_parsing()}, " + if self._should_query_url() + else "if(0, '', " + ) + event_conditional += f"{self.EVENT_TABLE_ALIAS}.event)) AS path_item_ungrouped" + + _fields.append(event_conditional) + + grouping_fields, grouping_params = self._get_grouping_fields() + _fields.extend(grouping_fields) + self.params.update(grouping_params) + + # remove empty strings + _fields = list(filter(None, _fields)) + + date_query, date_params = self._get_date_filter() + self.params.update(date_params) + + prop_filters = self._filter.properties + prop_query, prop_params = self._get_props(prop_filters) + 
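# The date and property conditions above, plus the event filter assembled next, end up as extra AND clauses in the single SELECT built below +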
self.params.update(prop_params) + + event_query, event_params = self._get_event_query() + self.params.update(event_params) + + person_query, person_params = self._get_person_query() + self.params.update(person_params) + + query = f""" + SELECT {','.join(_fields)} FROM events {self.EVENT_TABLE_ALIAS} + {self._get_disintct_id_query()} + {person_query} + {funnel_paths_join} + WHERE team_id = %(team_id)s + {event_query} + {date_query} + {prop_query} + {funnel_paths_filter} + ORDER BY {self.DISTINCT_ID_TABLE_ALIAS}.person_id, {self.EVENT_TABLE_ALIAS}.timestamp + """ + return query, self.params + + def _determine_should_join_distinct_ids(self) -> None: + self._should_join_distinct_ids = True + + def _get_grouping_fields(self) -> Tuple[List[str], Dict[str, Any]]: + _fields = [] + params = {} + + team: Team = Team.objects.get(pk=self._team_id) + + replacements = [] + + if self._filter.path_replacements and team.path_cleaning_filters and len(team.path_cleaning_filters) > 0: + replacements.extend(team.path_cleaning_filters) + + if self._filter.local_path_cleaning_filters and len(self._filter.local_path_cleaning_filters) > 0: + replacements.extend(self._filter.local_path_cleaning_filters) + + if len(replacements) > 0: + for idx, replacement in enumerate(replacements): + alias = replacement["alias"] + regex = replacement["regex"] + if idx == 0: + name = "path_item" if idx == len(replacements) - 1 else f"path_item_{idx}" + _fields.append( + f"replaceRegexpAll(path_item_ungrouped, %(regex_replacement_{idx})s, %(alias_{idx})s) as {name}" + ) + elif idx == len(replacements) - 1: + _fields.append( + f"replaceRegexpAll(path_item_{idx - 1}, %(regex_replacement_{idx})s, %(alias_{idx})s) as path_item" + ) + else: + _fields.append( + f"replaceRegexpAll(path_item_{idx - 1}, %(regex_replacement_{idx})s, %(alias_{idx})s) as path_item_{idx}" + ) + params[f"regex_replacement_{idx}"] = regex + params[f"alias_{idx}"] = alias + + else: + _fields.append("multiMatchAnyIndex(path_item_ungrouped, %(regex_groupings)s) AS group_index") + _fields.append("if(group_index > 0, %(groupings)s[group_index], path_item_ungrouped) AS path_item") + + return _fields, params + + def _get_current_url_parsing(self): + path_type, _ = get_property_string_expr("events", "$current_url", "'$current_url'", "properties") + return f"if(length({path_type}) > 1, trim( TRAILING '/' FROM {path_type}), {path_type})" + + def _get_screen_name_parsing(self): + path_type, _ = get_property_string_expr("events", "$screen_name", "'$screen_name'", "properties") + return path_type + + def _get_event_query(self) -> Tuple[str, Dict[str, Any]]: + params: Dict[str, Any] = {} + + conditions = [] + or_conditions = [] + if self._filter.include_pageviews: + or_conditions.append(f"event = '{PAGEVIEW_EVENT}'") + + if self._filter.include_screenviews: + or_conditions.append(f"event = '{SCREEN_EVENT}'") + + if self._filter.include_all_custom_events: + or_conditions.append(f"NOT event LIKE '$%%'") + + if self._filter.custom_events: + or_conditions.append(f"event IN %(custom_events)s") + params["custom_events"] = self._filter.custom_events + + if or_conditions: + conditions.append(f"({' OR '.join(or_conditions)})") + + if self._filter.exclude_events: + conditions.append(f"NOT path_item IN %(exclude_events)s") + params["exclude_events"] = self._filter.exclude_events + + if conditions: + return f" AND {' AND '.join(conditions)}", params + + return "", {} + + def _should_query_url(self) -> bool: + if ( + self._filter.target_events == [] and self._filter.custom_events == [] + ) 
and PAGEVIEW_EVENT not in self._filter.exclude_events: + return True + elif self._filter.include_pageviews: + return True + + return False + + def _should_query_screen(self) -> bool: + if ( + self._filter.target_events == [] and self._filter.custom_events == [] + ) and SCREEN_EVENT not in self._filter.exclude_events: + return True + elif self._filter.include_screenviews: + return True + + return False diff --git a/ee/clickhouse/queries/paths/paths.py b/ee/clickhouse/queries/paths/paths.py new file mode 100644 index 0000000000000..1ca08afc82337 --- /dev/null +++ b/ee/clickhouse/queries/paths/paths.py @@ -0,0 +1,288 @@ +from collections import defaultdict +from re import escape +from typing import Dict, List, Literal, Optional, Tuple, Union, cast + +from rest_framework.exceptions import ValidationError + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.queries.funnels.funnel_persons import ClickhouseFunnelPersons +from ee.clickhouse.queries.paths.path_event_query import PathEventQuery +from ee.clickhouse.sql.paths.path import PATH_ARRAY_QUERY +from posthog.constants import FUNNEL_PATH_BETWEEN_STEPS, LIMIT, PATH_EDGE_LIMIT +from posthog.models import Filter, Team +from posthog.models.filters.path_filter import PathFilter + +EVENT_IN_SESSION_LIMIT_DEFAULT = 5 +SESSION_TIME_THRESHOLD_DEFAULT = 1800000 # milliseconds to 30 minutes +EDGE_LIMIT_DEFAULT = 50 + + +class ClickhousePaths: + _filter: PathFilter + _funnel_filter: Optional[Filter] + _team: Team + + def __init__(self, filter: PathFilter, team: Team, funnel_filter: Optional[Filter] = None) -> None: + self._filter = filter + self._team = team + self.params = { + "team_id": self._team.pk, + "event_in_session_limit": self._filter.step_limit or EVENT_IN_SESSION_LIMIT_DEFAULT, + "session_time_threshold": SESSION_TIME_THRESHOLD_DEFAULT, + "groupings": self._filter.path_groupings or None, + "regex_groupings": None, + } + self._funnel_filter = funnel_filter + + if self._filter.include_all_custom_events and self._filter.custom_events: + raise ValidationError("Cannot include all custom events and specific custom events in the same query") + + if not self._filter.limit: + self._filter = self._filter.with_data({LIMIT: 100}) + + if self._filter.path_groupings: + regex_groupings = [] + for grouping in self._filter.path_groupings: + regex_grouping = escape(grouping) + # don't allow arbitrary regex for now + regex_grouping = regex_grouping.replace("\\*", ".*") + regex_groupings.append(regex_grouping) + self.params["regex_groupings"] = regex_groupings + + if self._filter.edge_limit is None and not (self._filter.start_point and self._filter.end_point): + # no edge restriction when both start and end points are defined + self._filter = self._filter.with_data({PATH_EDGE_LIMIT: EDGE_LIMIT_DEFAULT}) + + if ( + self._filter.max_edge_weight + and self._filter.min_edge_weight + and self._filter.max_edge_weight < self._filter.min_edge_weight + ): + raise ValidationError("Max Edge weight can't be lower than min edge weight") + + def run(self, *args, **kwargs): + results = self._exec_query() + + if not self._filter.min_edge_weight and not self._filter.max_edge_weight: + results = self.validate_results(results) + + return self._format_results(results) + + def _format_results(self, results): + if not results or len(results) == 0: + return [] + + resp = [] + for res in results: + resp.append( + {"source": res[0], "target": res[1], "value": res[2], "average_conversion_time": res[3],} + ) + return resp + + def _exec_query(self) -> List[Tuple]: + query = 
self.get_query() + return sync_execute(query, self.params) + + def get_query(self) -> str: + + path_query = self.get_path_query() + funnel_cte = "" + + if self.should_query_funnel(): + funnel_cte = self.get_path_query_funnel_cte(cast(Filter, self._funnel_filter)) + + return funnel_cte + path_query + + def get_paths_per_person_query(self) -> str: + path_event_query, params = PathEventQuery(filter=self._filter, team_id=self._team.pk).get_query() + self.params.update(params) + + boundary_event_filter = self.get_target_point_filter() + target_clause, target_params = self.get_target_clause() + self.params.update(target_params) + + session_threshold_clause = self.get_session_threshold_clause() + + return PATH_ARRAY_QUERY.format( + path_event_query=path_event_query, + boundary_event_filter=boundary_event_filter, + target_clause=target_clause, + session_threshold_clause=session_threshold_clause, + ) + + def should_query_funnel(self) -> bool: + if self._filter.funnel_paths and self._funnel_filter: + return True + return False + + def get_path_query(self) -> str: + + paths_per_person_query = self.get_paths_per_person_query() + + self.params["edge_limit"] = self._filter.edge_limit + + edge_weight_filter, edge_weight_params = self.get_edge_weight_clause() + self.params.update(edge_weight_params) + + return f""" + SELECT last_path_key as source_event, + path_key as target_event, + COUNT(*) AS event_count, + avg(conversion_time) AS average_conversion_time + FROM ({paths_per_person_query}) + WHERE source_event IS NOT NULL + GROUP BY source_event, + target_event + {edge_weight_filter} + ORDER BY event_count DESC, + source_event, + target_event + {'LIMIT %(edge_limit)s' if self._filter.edge_limit else ''} + """ + + def get_path_query_funnel_cte(self, funnel_filter: Filter): + funnel_persons_generator = ClickhouseFunnelPersons( + funnel_filter, + self._team, + include_timestamp=bool(self._filter.funnel_paths), + include_preceding_timestamp=self._filter.funnel_paths == FUNNEL_PATH_BETWEEN_STEPS, + no_person_limit=True, + ) + funnel_persons_query = funnel_persons_generator.get_query() + funnel_persons_query_new_params = funnel_persons_query.replace("%(", "%(funnel_") + funnel_persons_param = funnel_persons_generator.params + new_funnel_params = {"funnel_" + str(key): val for key, val in funnel_persons_param.items()} + self.params.update(new_funnel_params) + return f""" + WITH {PathEventQuery.FUNNEL_PERSONS_ALIAS} AS ( + {funnel_persons_query_new_params} + ) + """ + + def get_edge_weight_clause(self) -> Tuple[str, Dict]: + params: Dict[str, int] = {} + + conditions = [] + + if self._filter.min_edge_weight: + params["min_edge_weight"] = self._filter.min_edge_weight + conditions.append("event_count >= %(min_edge_weight)s") + + if self._filter.max_edge_weight: + params["max_edge_weight"] = self._filter.max_edge_weight + conditions.append("event_count <= %(max_edge_weight)s") + + if conditions: + return f"HAVING {' AND '.join(conditions)}", params + + return "", params + + def get_target_point_filter(self) -> str: + if self._filter.end_point and self._filter.start_point: + return "WHERE start_target_index > 0 AND end_target_index > 0" + elif self._filter.end_point or self._filter.start_point: + return f"WHERE target_index > 0" + else: + return "" + + def get_session_threshold_clause(self) -> str: + + if self.should_query_funnel(): + self._funnel_filter = cast(Filter, self._funnel_filter) # typing mess + + # TODO: cleanup funnels interval interpolation mess so this can get cleaned up + if 
self._funnel_filter.funnel_window_interval: + funnel_window_interval = self._funnel_filter.funnel_window_interval + funnel_window_interval_unit = self._funnel_filter.funnel_window_interval_unit_ch() + elif self._funnel_filter.funnel_window_days: + funnel_window_interval = self._funnel_filter.funnel_window_days + funnel_window_interval_unit = "DAY" + else: + funnel_window_interval = 14 + funnel_window_interval_unit = "DAY" + # Not possible to directly compare two interval data types, so using a proxy Date. + return f"arraySplit(x -> if(toDateTime('2018-01-01') + toIntervalSecond(x.3 / 1000) < toDateTime('2018-01-01') + INTERVAL {funnel_window_interval} {funnel_window_interval_unit}, 0, 1), paths_tuple)" + + return "arraySplit(x -> if(x.3 < %(session_time_threshold)s, 0, 1), paths_tuple)" + + def get_target_clause(self) -> Tuple[str, Dict]: + params: Dict[str, Union[str, None]] = {"target_point": None, "secondary_target_point": None} + + if self._filter.end_point and self._filter.start_point: + params.update({"target_point": self._filter.end_point, "secondary_target_point": self._filter.start_point}) + return ( + """ + , indexOf(compact_path, %(secondary_target_point)s) as start_target_index + , if(start_target_index > 0, arraySlice(compact_path, start_target_index), compact_path) as start_filtered_path + , if(start_target_index > 0, arraySlice(timings, start_target_index), timings) as start_filtered_timings + , indexOf(start_filtered_path, %(target_point)s) as end_target_index + , if(end_target_index > 0, arrayResize(start_filtered_path, end_target_index), start_filtered_path) as filtered_path + , if(end_target_index > 0, arrayResize(start_filtered_timings, end_target_index), start_filtered_timings) as filtered_timings + , if(length(filtered_path) > %(event_in_session_limit)s, arrayConcat(arraySlice(filtered_path, 1, intDiv(%(event_in_session_limit)s,2)), ['...'], arraySlice(filtered_path, (-1)*intDiv(%(event_in_session_limit)s, 2), intDiv(%(event_in_session_limit)s, 2))), filtered_path) AS limited_path + , if(length(filtered_timings) > %(event_in_session_limit)s, arrayConcat(arraySlice(filtered_timings, 1, intDiv(%(event_in_session_limit)s, 2)), [filtered_timings[1+intDiv(%(event_in_session_limit)s, 2)]], arraySlice(filtered_timings, (-1)*intDiv(%(event_in_session_limit)s, 2), intDiv(%(event_in_session_limit)s, 2))), filtered_timings) AS limited_timings + """, + params, + ) + else: + path_limiting_clause, time_limiting_clause = self.get_filtered_path_ordering() + compacting_function = self.get_array_compacting_function() + params.update({"target_point": self._filter.end_point or self._filter.start_point}) + return ( + f""" + , indexOf(compact_path, %(target_point)s) as target_index + , if(target_index > 0, {compacting_function}(compact_path, target_index), compact_path) as filtered_path + , if(target_index > 0, {compacting_function}(timings, target_index), timings) as filtered_timings + , {path_limiting_clause} as limited_path + , {time_limiting_clause} as limited_timings + """, + params, + ) + + def get_array_compacting_function(self) -> Literal["arrayResize", "arraySlice"]: + if self._filter.end_point: + return "arrayResize" + else: + return "arraySlice" + + def get_filtered_path_ordering(self) -> Tuple[str, str]: + + if self._filter.end_point: + return ( + "arraySlice(filtered_path, (-1) * %(event_in_session_limit)s)", + "arraySlice(filtered_timings, (-1) * %(event_in_session_limit)s)", + ) + else: + return ( + "arraySlice(filtered_path, 1, %(event_in_session_limit)s)", + 
"arraySlice(filtered_timings, 1, %(event_in_session_limit)s)", + ) + + def validate_results(self, results): + # Query guarantees results list to be: + # 1. Directed, Acyclic Tree where each node has only 1 child + # 2. All start nodes beginning with 1_ + + seen = set() # source nodes that've been traversed + edges = defaultdict(list) + validated_results = [] + starting_nodes_stack = [] + + for result in results: + edges[result[0]].append(result[1]) + if result[0].startswith("1_"): + # All nodes with 1_ are valid starting nodes + starting_nodes_stack.append(result[0]) + + while starting_nodes_stack: + current_node = starting_nodes_stack.pop() + seen.add(current_node) + + for node in edges[current_node]: + if node not in seen: + starting_nodes_stack.append(node) + + for result in results: + if result[0] in seen: + validated_results.append(result) + + return validated_results diff --git a/ee/clickhouse/queries/paths/paths_persons.py b/ee/clickhouse/queries/paths/paths_persons.py new file mode 100644 index 0000000000000..59a5b46ff9d75 --- /dev/null +++ b/ee/clickhouse/queries/paths/paths_persons.py @@ -0,0 +1,82 @@ +from typing import cast + +from ee.clickhouse.queries.paths.paths import ClickhousePaths +from ee.clickhouse.sql.funnels.funnel import FUNNEL_PERSONS_BY_STEP_SQL +from posthog.models import Person +from posthog.models.filters.filter import Filter + + +class ClickhousePathsPersons(ClickhousePaths): + """ + `path_start_key`, `path_end_key`, and `path_dropoff_key` are three new params for this class. + These determine the start and end point of Paths you want. All of these are optional. + + Not specifying them means "get me all users on this path query". + + Only specifying `path_start_key` means "get me all users whose paths start at this key" + Only specifying `path_end_key` means "get me all users whose paths end at this key" + + Specifying both means "get me all users whose path starts at `start_key` and ends at `end_key`." + + Specifying `path_dropoff_key` means "get me users who dropped off after this key. If you specify + this key, the other two keys are invalid + + Note that: + Persons are calculated only between direct paths. There should not be any + other path item between start and end key. 
+ """ + + def get_query(self): + + paths_per_person_query = self.get_paths_per_person_query() + person_path_filter = self.get_person_path_filter() + paths_funnel_cte = "" + + if self.should_query_funnel(): + paths_funnel_cte = self.get_path_query_funnel_cte(cast(Filter, self._funnel_filter)) + + self.params["limit"] = self._filter.limit + self.params["offset"] = self._filter.offset + + return f""" + {paths_funnel_cte} + SELECT DISTINCT person_id + FROM ( + {paths_per_person_query} + ) + WHERE {person_path_filter} + ORDER BY person_id + LIMIT %(limit)s + OFFSET %(offset)s + """ + + def get_person_path_filter(self) -> str: + conditions = [] + + if self._filter.path_dropoff_key: + conditions.append("path_dropoff_key = %(path_dropoff_key)s") + self.params["path_dropoff_key"] = self._filter.path_dropoff_key + else: + if self._filter.path_start_key: + conditions.append("last_path_key = %(path_start_key)s") + self.params["path_start_key"] = self._filter.path_start_key + + if self._filter.path_end_key: + conditions.append("path_key = %(path_end_key)s") + self.params["path_end_key"] = self._filter.path_end_key + + if conditions: + return " AND ".join(conditions) + + return "1=1" + + def _format_results(self, results): + people = Person.objects.filter(team_id=self._team.pk, uuid__in=[val[0] for val in results]) + + from posthog.api.person import PersonSerializer + + return PersonSerializer(people, many=True).data, len(results) > cast(int, self._filter.limit) - 1 + + def run(self): + results = self._exec_query() + return self._format_results(results) diff --git a/ee/clickhouse/queries/person_query.py b/ee/clickhouse/queries/person_query.py new file mode 100644 index 0000000000000..980497dc70bce --- /dev/null +++ b/ee/clickhouse/queries/person_query.py @@ -0,0 +1,119 @@ +from typing import Dict, List, Optional, Set, Tuple, Union + +from ee.clickhouse.materialized_columns.columns import ColumnName +from ee.clickhouse.models.property import extract_tables_and_properties, prop_filter_json_extract +from ee.clickhouse.queries.column_optimizer import ColumnOptimizer +from posthog.models import Filter +from posthog.models.entity import Entity +from posthog.models.filters.path_filter import PathFilter +from posthog.models.filters.retention_filter import RetentionFilter +from posthog.models.property import Property + + +class ClickhousePersonQuery: + """ + Query class responsible for joining with `person` clickhouse table + + For sake of performance, this class: + - Tries to do as much person property filtering as possible here + - Minimizes the amount of columns read + """ + + PERSON_PROPERTIES_ALIAS = "person_props" + ALIASES = {"properties": "person_props"} + + _filter: Union[Filter, PathFilter, RetentionFilter] + _team_id: int + _column_optimizer: ColumnOptimizer + _extra_fields: Set[ColumnName] + + def __init__( + self, + filter: Union[Filter, PathFilter, RetentionFilter], + team_id: int, + column_optimizer: Optional[ColumnOptimizer] = None, + *, + entity: Optional[Entity] = None, + extra_fields: List[ColumnName] = [], + ) -> None: + self._filter = filter + self._team_id = team_id + self._entity = entity + self._column_optimizer = column_optimizer or ColumnOptimizer(self._filter, self._team_id) + self._extra_fields = set(extra_fields) + + if self.PERSON_PROPERTIES_ALIAS in self._extra_fields: + self._extra_fields = self._extra_fields - {self.PERSON_PROPERTIES_ALIAS} | {"properties"} + + def get_query(self) -> Tuple[str, Dict]: + fields = "id" + " ".join( + f", argMax({column_name}, _timestamp) as {alias}" 
for column_name, alias in self._get_fields() + ) + + person_filters, params = self._get_person_filters() + + return ( + f""" + SELECT {fields} + FROM person + WHERE team_id = %(team_id)s + GROUP BY id + HAVING max(is_deleted) = 0 {person_filters} + """, + params, + ) + + @property + def fields(self) -> List[ColumnName]: + "Returns person table fields this query exposes" + return [alias for column_name, alias in self._get_fields()] + + @property + def is_used(self): + "Returns whether properties or any other columns are actually being queried" + if any(self._uses_person_id(prop) for prop in self._filter.properties): + return True + if any(self._uses_person_id(prop) for entity in self._filter.entities for prop in entity.properties): + return True + + return len(self._column_optimizer.person_columns_to_query) > 0 + + def _uses_person_id(self, prop: Property) -> bool: + return prop.type in ("person", "static-cohort", "precalculated-cohort") + + def _get_fields(self) -> List[Tuple[str, str]]: + # :TRICKY: Figure out what fields we want to expose - minimizing this set is good for performance. + # We use the result from column_optimizer to figure out counts of all properties to be filtered and queried. + # Here, we remove the ones only to be used for filtering. + # The same property might be present for both querying and filtering, and hence the Counter. + properties_to_query = self._column_optimizer._used_properties_with_type("person") + properties_to_query -= extract_tables_and_properties(self._filter.properties) + + if self._entity is not None: + properties_to_query -= extract_tables_and_properties(self._entity.properties) + + columns = self._column_optimizer.columns_to_query("person", set(properties_to_query)) | set(self._extra_fields) + + return [(column_name, self.ALIASES.get(column_name, column_name)) for column_name in sorted(columns)] + + def _get_person_filters(self) -> Tuple[str, Dict]: + conditions, params = [""], {} + + properties = self._filter.properties + (self._entity.properties if self._entity else []) + + for index, property in enumerate(properties): + if property.type != "person": + continue + + expr, prop_params = prop_filter_json_extract( + property, + index, + prepend="personquery", + allow_denormalized_props=True, + transform_expression=lambda column_name: f"argMax(person.{column_name}, _timestamp)", + ) + + conditions.append(expr) + params.update(prop_params) + + return " ".join(conditions), params diff --git a/ee/clickhouse/queries/retention/retention_event_query.py b/ee/clickhouse/queries/retention/retention_event_query.py new file mode 100644 index 0000000000000..0dec3a2180b03 --- /dev/null +++ b/ee/clickhouse/queries/retention/retention_event_query.py @@ -0,0 +1,123 @@ +from typing import Any, Dict, Tuple + +from ee.clickhouse.models.action import format_action_filter +from ee.clickhouse.models.group import get_aggregation_target_field +from ee.clickhouse.queries.event_query import ClickhouseEventQuery +from ee.clickhouse.queries.util import get_trunc_func_ch +from posthog.constants import ( + PAGEVIEW_EVENT, + TREND_FILTER_TYPE_ACTIONS, + TREND_FILTER_TYPE_EVENTS, + TRENDS_LINEAR, + RetentionQueryType, +) +from posthog.models import Entity +from posthog.models.action import Action +from posthog.models.filters.retention_filter import RetentionFilter + + +class RetentionEventsQuery(ClickhouseEventQuery): + _filter: RetentionFilter + _event_query_type: RetentionQueryType + _trunc_func: str + + def __init__(self, event_query_type: RetentionQueryType, *args, **kwargs): + 
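# event_query_type (e.g. TARGET vs. TARGET_FIRST_TIME) determines which entity is filtered on, which timestamp expression is selected, and how the date range is applied in get_query() + 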
self._event_query_type = event_query_type + super().__init__(*args, **kwargs) + + self._trunc_func = get_trunc_func_ch(self._filter.period) + + def get_query(self) -> Tuple[str, Dict[str, Any]]: + _fields = [ + self.get_timestamp_field(), + f"{get_aggregation_target_field(self._filter.aggregation_group_type_index, self.EVENT_TABLE_ALIAS, self.DISTINCT_ID_TABLE_ALIAS)} as target", + ( + f"argMin(e.uuid, {self._trunc_func}(e.timestamp)) as min_uuid" + if self._event_query_type == RetentionQueryType.TARGET_FIRST_TIME + else f"{self.EVENT_TABLE_ALIAS}.uuid AS uuid" + ), + ( + f"argMin(e.event, {self._trunc_func}(e.timestamp)) as min_event" + if self._event_query_type == RetentionQueryType.TARGET_FIRST_TIME + else f"{self.EVENT_TABLE_ALIAS}.event AS event" + ), + ] + _fields = list(filter(None, _fields)) + + date_query, date_params = self._get_date_filter() + self.params.update(date_params) + + prop_filters = [*self._filter.properties] + prop_query, prop_params = self._get_props(prop_filters) + self.params.update(prop_params) + + entity_query, entity_params = self._get_entity_query( + entity=self._filter.target_entity + if self._event_query_type == RetentionQueryType.TARGET + or self._event_query_type == RetentionQueryType.TARGET_FIRST_TIME + else self._filter.returning_entity + ) + self.params.update(entity_params) + + person_query, person_params = self._get_person_query() + self.params.update(person_params) + + groups_query, groups_params = self._get_groups_query() + self.params.update(groups_params) + + query = f""" + SELECT {','.join(_fields)} FROM events {self.EVENT_TABLE_ALIAS} + {self._get_disintct_id_query()} + {person_query} + {groups_query} + WHERE team_id = %(team_id)s + {f"AND {entity_query}"} + {f"AND {date_query}" if self._event_query_type != RetentionQueryType.TARGET_FIRST_TIME else ''} + {prop_query} + {f"GROUP BY target HAVING {date_query}" if self._event_query_type == RetentionQueryType.TARGET_FIRST_TIME else ''} + """ + + return query, self.params + + def get_timestamp_field(self) -> str: + if self._event_query_type == RetentionQueryType.TARGET: + return f"DISTINCT {self._trunc_func}({self.EVENT_TABLE_ALIAS}.timestamp) AS event_date" + elif self._event_query_type == RetentionQueryType.TARGET_FIRST_TIME: + return f"min({self._trunc_func}(e.timestamp)) as event_date" + else: + return f"{self.EVENT_TABLE_ALIAS}.timestamp AS event_date" + + def _determine_should_join_distinct_ids(self) -> None: + self._should_join_distinct_ids = True + + def _get_entity_query(self, entity: Entity): + prepend = self._event_query_type + if entity.type == TREND_FILTER_TYPE_ACTIONS: + action = Action.objects.get(pk=entity.id) + action_query, params = format_action_filter(action, prepend=prepend, use_loop=False) + condition = action_query + elif entity.type == TREND_FILTER_TYPE_EVENTS: + condition = f"{self.EVENT_TABLE_ALIAS}.event = %({prepend}_event)s" + params = {f"{prepend}_event": entity.id} + else: + condition = f"{self.EVENT_TABLE_ALIAS}.event = %({prepend}_event)s" + params = {f"{prepend}_event": PAGEVIEW_EVENT} + return condition, params + + def _get_date_filter(self): + query = ( + f"event_date >= toDateTime(%({self._event_query_type}_start_date)s) AND event_date <= toDateTime(%({self._event_query_type}_end_date)s)" + if self._event_query_type == RetentionQueryType.TARGET_FIRST_TIME + else f"toDateTime({self.EVENT_TABLE_ALIAS}.timestamp) >= toDateTime(%({self._event_query_type}_start_date)s) AND toDateTime({self.EVENT_TABLE_ALIAS}.timestamp) <= 
toDateTime(%({self._event_query_type}_end_date)s)" + ) + params = { + f"{self._event_query_type}_start_date": self._filter.date_from.strftime( + "%Y-%m-%d{}".format(" %H:%M:%S" if self._filter.period == "Hour" else " 00:00:00") + ), + f"{self._event_query_type}_end_date": ( + (self._filter.date_from + self._filter.period_increment) + if self._filter.display == TRENDS_LINEAR and self._event_query_type == RetentionQueryType.TARGET + else self._filter.date_to + ).strftime("%Y-%m-%d{}".format(" %H:%M:%S" if self._filter.period == "Hour" else " 00:00:00")), + } + return query, params diff --git a/ee/clickhouse/queries/session_recordings/__init__.py b/ee/clickhouse/queries/session_recordings/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/ee/clickhouse/queries/session_recordings/clickhouse_session_recording.py b/ee/clickhouse/queries/session_recordings/clickhouse_session_recording.py new file mode 100644 index 0000000000000..b428ee6cc7550 --- /dev/null +++ b/ee/clickhouse/queries/session_recordings/clickhouse_session_recording.py @@ -0,0 +1,31 @@ +import json +from typing import List + +from ee.clickhouse.client import sync_execute +from posthog.models import SessionRecordingEvent +from posthog.queries.session_recordings.session_recording import SessionRecording + + +class ClickhouseSessionRecording(SessionRecording): + _recording_snapshot_query = """ + SELECT session_id, distinct_id, timestamp, snapshot_data + FROM session_recording_events + WHERE + team_id = %(team_id)s + AND session_id = %(session_id)s + ORDER BY timestamp + """ + + def _query_recording_snapshots(self) -> List[SessionRecordingEvent]: + response = sync_execute( + self._recording_snapshot_query, {"team_id": self._team.id, "session_id": self._session_recording_id,}, + ) + return [ + SessionRecordingEvent( + session_id=session_id, + distinct_id=distinct_id, + timestamp=timestamp, + snapshot_data=json.loads(snapshot_data), + ) + for session_id, distinct_id, timestamp, snapshot_data in response + ] diff --git a/ee/clickhouse/queries/session_recordings/clickhouse_session_recording_list.py b/ee/clickhouse/queries/session_recordings/clickhouse_session_recording_list.py new file mode 100644 index 0000000000000..f6adf8164218b --- /dev/null +++ b/ee/clickhouse/queries/session_recordings/clickhouse_session_recording_list.py @@ -0,0 +1,119 @@ +from typing import Any, Dict, List, NamedTuple, Tuple + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.models.action import format_entity_filter +from ee.clickhouse.models.entity import get_entity_filtering_params +from ee.clickhouse.models.property import parse_prop_clauses +from ee.clickhouse.queries.person_query import ClickhousePersonQuery +from posthog.models.entity import Entity +from posthog.models.filters.session_recordings_filter import SessionRecordingsFilter +from posthog.queries.session_recordings.session_recording_list import ( + EventsQueryWithAggregateClausesSQL, + SessionRecordingList, + SessionRecordingQueryResult, +) + + +class EventFiltersSQL(NamedTuple): + event_select_clause: str + event_where_clause: str + aggregate_select_clause: str + aggregate_where_clause: str + params: Dict[str, Any] + + +class ClickhouseSessionRecordingList(SessionRecordingList): + _recording_duration_select_statement = ( + "dateDiff('second', toDateTime(MIN(timestamp)), toDateTime(MAX(timestamp))) as duration," + ) + _recording_full_snapshot_select_statement = "COUNT((JSONExtractInt(snapshot_data, 'type') = 2 OR JSONExtractBool(snapshot_data, 
'has_full_snapshot')) ? 1 : NULL) as full_snapshots" + _session_recording_event_table = "session_recording_events" + _session_recording_select_statements = """ + any(session_recordings.start_time) as start_time, + any(session_recordings.end_time) as end_time, + any(session_recordings.duration) as duration, + any(filtered_events.distinct_id) as distinct_id + """ + + _event_query = """ + SELECT + timestamp, + distinct_id + {event_filter_event_select_clause} + FROM events + WHERE + team_id = %(team_id)s + {events_timestamp_clause} + {event_filter_event_where_clause} + """ + + def _get_distinct_id_clause(self) -> Tuple[Dict[str, Any], str]: + distinct_id_clause = "" + distinct_id_params = {} + if self._filter.person_uuid: + distinct_id_clause = f"AND distinct_id IN (SELECT distinct_id from person_distinct_id WHERE person_id = %(person_uuid)s AND team_id = %(team_id)s)" + distinct_id_params = {"person_uuid": self._filter.person_uuid, "team_id": self._team.pk} + return distinct_id_params, distinct_id_clause + + def _get_events_query_with_aggregate_clauses(self) -> EventsQueryWithAggregateClausesSQL: + event_filters = format_event_filters(self._filter) + events_timestamp_params, events_timestamp_clause = self._get_events_timestamp_clause() + event_query = self._event_query.format( + events_timestamp_clause=events_timestamp_clause, + event_filter_event_select_clause=event_filters.event_select_clause, + event_filter_event_where_clause=event_filters.event_where_clause, + ) + + params: Dict[str, Any] = {"team_id": self._team.pk, **events_timestamp_params, **event_filters.params} + + return EventsQueryWithAggregateClausesSQL( + event_query, params, event_filters.aggregate_select_clause, event_filters.aggregate_where_clause, + ) + + def _data_to_return(self, results: List[Any]) -> List[Dict[str, Any]]: + return [dict(zip(["session_id", "start_time", "end_time", "duration", "distinct_id"], row)) for row in results] + + def run(self, *args, **kwargs) -> SessionRecordingQueryResult: + query, query_params = self._build_query() + query_results = sync_execute(query, query_params) + session_recordings = self._data_to_return(query_results) + return self._paginate_results(session_recordings) + + +def format_event_filters(filter: SessionRecordingsFilter) -> EventFiltersSQL: + if len(filter.entities) == 0: + return EventFiltersSQL("", "", "", "", {}) + + event_select_clause = "" + aggregate_select_clause = "" + aggregate_where_conditions = [] + event_where_conditions = [] + + params: Dict = {} + + for index, entity in enumerate(filter.entities): + condition_sql, filter_params = format_event_filter(entity, prepend=f"event_matcher_{index}") + event_select_clause += f", if({condition_sql}, 1, 0) as event_match_{index}" + aggregate_select_clause += f", sum(event_match_{index}) as count_event_match_{index}" + aggregate_where_conditions.append(f"count_event_match_{index} > 0") + event_where_conditions.append(condition_sql) + params = {**params, **filter_params} + + aggregate_where_clause = f"WHERE {' AND '.join(aggregate_where_conditions)}" + event_where_clause = f"AND ({' OR '.join(event_where_conditions)})" + + return EventFiltersSQL( + event_select_clause, event_where_clause, aggregate_select_clause, aggregate_where_clause, params, + ) + + +def format_event_filter(entity: Entity, prepend: str): + filter_sql, params = format_entity_filter(entity, prepend=prepend, filter_by_team=False) + if entity.properties: + filters, filter_params = parse_prop_clauses( + entity.properties, prepend=prepend, team_id=None, 
allow_denormalized_props=False + ) + filter_sql += f" {filters}" + params = {**params, **filter_params} + + return filter_sql, params diff --git a/ee/clickhouse/queries/session_recordings/test/test_clickhouse_session_recording.py b/ee/clickhouse/queries/session_recordings/test/test_clickhouse_session_recording.py new file mode 100644 index 0000000000000..da2872f1a021b --- /dev/null +++ b/ee/clickhouse/queries/session_recordings/test/test_clickhouse_session_recording.py @@ -0,0 +1,16 @@ +from uuid import uuid4 + +from ee.clickhouse.models.session_recording_event import create_session_recording_event +from ee.clickhouse.queries.session_recordings.clickhouse_session_recording import ClickhouseSessionRecording +from ee.clickhouse.util import ClickhouseTestMixin +from posthog.queries.session_recordings.test.test_session_recording import factory_session_recording_test + + +def _create_session_recording_event(**kwargs): + create_session_recording_event( + uuid=uuid4(), **kwargs, + ) + + +class TestClickhouseSessionRecording(ClickhouseTestMixin, factory_session_recording_test(ClickhouseSessionRecording, _create_session_recording_event)): # type: ignore + pass diff --git a/ee/clickhouse/queries/session_recordings/test/test_clickhouse_session_recording_list.py b/ee/clickhouse/queries/session_recordings/test/test_clickhouse_session_recording_list.py new file mode 100644 index 0000000000000..c18da2dd2eed6 --- /dev/null +++ b/ee/clickhouse/queries/session_recordings/test/test_clickhouse_session_recording_list.py @@ -0,0 +1,23 @@ +from uuid import uuid4 + +from ee.clickhouse.models.action import Action, ActionStep +from ee.clickhouse.models.event import create_event +from ee.clickhouse.models.session_recording_event import create_session_recording_event +from ee.clickhouse.queries.session_recordings.clickhouse_session_recording_list import ClickhouseSessionRecordingList +from ee.clickhouse.util import ClickhouseTestMixin +from posthog.queries.session_recordings.test.test_session_recording_list import factory_session_recordings_list_test + + +def _create_event(**kwargs): + kwargs.update({"event_uuid": uuid4()}) + create_event(**kwargs) + + +def _create_session_recording_event(**kwargs): + create_session_recording_event( + uuid=uuid4(), **kwargs, + ) + + +class TestClickhouseSessionRecordingsList(ClickhouseTestMixin, factory_session_recordings_list_test(ClickhouseSessionRecordingList, _create_event, _create_session_recording_event, Action.objects.create, ActionStep.objects.create)): # type: ignore + pass diff --git a/ee/clickhouse/queries/sessions/average.py b/ee/clickhouse/queries/sessions/average.py index 7a5d16fac0b4c..5a940b17dbb3c 100644 --- a/ee/clickhouse/queries/sessions/average.py +++ b/ee/clickhouse/queries/sessions/average.py @@ -1,47 +1,66 @@ from typing import List from dateutil.relativedelta import relativedelta -from django.utils import timezone from ee.clickhouse.client import sync_execute from ee.clickhouse.models.property import parse_prop_clauses -from ee.clickhouse.queries.util import get_interval_annotation_ch, get_time_diff, parse_timestamps +from ee.clickhouse.queries.sessions.util import entity_query_conditions +from ee.clickhouse.queries.util import ( + format_ch_timestamp, + get_earliest_timestamp, + get_interval_func_ch, + get_trunc_func_ch, + parse_timestamps, +) from ee.clickhouse.sql.events import NULL_SQL from ee.clickhouse.sql.sessions.average_all import AVERAGE_SQL from ee.clickhouse.sql.sessions.average_per_period import AVERAGE_PER_PERIOD_SQL from 
ee.clickhouse.sql.sessions.no_events import SESSIONS_NO_EVENTS_SQL from posthog.models import Filter, Team +from posthog.queries.sessions.sessions import scale_time_series from posthog.utils import append_data, friendly_time class ClickhouseSessionsAvg: def calculate_avg(self, filter: Filter, team: Team): - parsed_date_from, parsed_date_to = parse_timestamps(filter) + parsed_date_from, parsed_date_to, date_params = parse_timestamps(filter, team.pk) - filters, params = parse_prop_clauses(filter.properties, team.pk) + filters, params = parse_prop_clauses(filter.properties, team.pk, has_person_id_joined=False) - interval_notation = get_interval_annotation_ch(filter.interval) - num_intervals, seconds_in_interval = get_time_diff(filter.interval or "day", filter.date_from, filter.date_to) + trunc_func = get_trunc_func_ch(filter.interval) + interval_func = get_interval_func_ch(filter.interval) + + entity_conditions, entity_params = entity_query_conditions(filter, team) + if not entity_conditions: + entity_conditions = ["event != '$feature_flag_called'"] # default condition + + params = {**params, **entity_params, **date_params} + entity_query = " OR ".join(entity_conditions) avg_query = SESSIONS_NO_EVENTS_SQL.format( - team_id=team.pk, date_from=parsed_date_from, date_to=parsed_date_to, filters=filters, sessions_limit="", + team_id=team.pk, + date_from=parsed_date_from, + date_to=parsed_date_to, + filters=filters, + sessions_limit="", + entity_filter=f"AND ({entity_query})", ) - per_period_query = AVERAGE_PER_PERIOD_SQL.format(sessions=avg_query, interval=interval_notation) + per_period_query = AVERAGE_PER_PERIOD_SQL.format(sessions=avg_query, interval=trunc_func) - null_sql = NULL_SQL.format( - date_to=filter.date_to.strftime("%Y-%m-%d 00:00:00"), - interval=interval_notation, - num_intervals=num_intervals, - seconds_in_interval=seconds_in_interval, - ) + null_sql = NULL_SQL.format(trunc_func=trunc_func, interval_func=interval_func,) final_query = AVERAGE_SQL.format(sessions=per_period_query, null_sql=null_sql) - params = {**params, "team_id": team.pk} + params["team_id"] = team.pk + params["date_from"] = format_ch_timestamp(filter.date_from or get_earliest_timestamp(team.pk), filter) + params["date_to"] = format_ch_timestamp(filter.date_to, filter) + params["interval"] = filter.interval response = sync_execute(final_query, params) values = self.clean_values(filter, response) time_series_data = append_data(values, interval=filter.interval, math=None) + scaled_data, _ = scale_time_series(time_series_data["data"]) + time_series_data.update({"data": scaled_data}) # calculate average total = sum(val[1] for val in values) @@ -67,7 +86,7 @@ def _format_avg(self, avg: float): avg_split = avg_formatted.split(" ") time_series_data = {} time_series_data.update( - {"label": "Average Duration of Session ({})".format(avg_split[1]), "count": int(avg_split[0]),} + {"label": "Average Session Length ({})".format(avg_split[1]), "count": int(avg_split[0]),} ) - time_series_data.update({"chartLabel": "Average Duration of Session (seconds)"}) + time_series_data.update({"chartLabel": "Average Session Length ({})".format(avg_split[1])}) return time_series_data diff --git a/ee/clickhouse/queries/sessions/clickhouse_sessions.py b/ee/clickhouse/queries/sessions/clickhouse_sessions.py index 10003dc9e461d..ccd0d39bf2be2 100644 --- a/ee/clickhouse/queries/sessions/clickhouse_sessions.py +++ b/ee/clickhouse/queries/sessions/clickhouse_sessions.py @@ -1,40 +1,41 @@ -from typing import Any, Dict, List +from datetime import 
datetime +from typing import Any, Dict, List, Union, cast from dateutil.relativedelta import relativedelta from django.utils import timezone from ee.clickhouse.queries.sessions.average import ClickhouseSessionsAvg from ee.clickhouse.queries.sessions.distribution import ClickhouseSessionsDist -from ee.clickhouse.queries.sessions.list import SESSIONS_LIST_DEFAULT_LIMIT, ClickhouseSessionsList from posthog.constants import SESSION_AVG, SESSION_DIST -from posthog.models import Filter, Team +from posthog.models import Team +from posthog.models.filters.sessions_filter import SessionsFilter from posthog.queries.base import BaseQuery, convert_to_comparison, determine_compared_filter from posthog.utils import relative_date_parse -class ClickhouseSessions(BaseQuery, ClickhouseSessionsList, ClickhouseSessionsAvg, ClickhouseSessionsDist): - def _set_default_dates(self, filter: Filter) -> None: - # format default dates - if filter.session_type != SESSION_AVG and filter.session_type != SESSION_DIST: - if not filter._date_from: - filter._date_from = timezone.now().replace(hour=0, minute=0, second=0, microsecond=0) - if not filter._date_to and filter.date_from: - filter._date_to = filter.date_from + relativedelta(days=1) - else: - if not filter._date_from: - filter._date_from = relative_date_parse("-7d") - if not filter._date_to: - filter._date_to = timezone.now() - - def run(self, filter: Filter, team: Team, *args, **kwargs) -> List[Dict[str, Any]]: - limit = kwargs.get("limit", SESSIONS_LIST_DEFAULT_LIMIT) - offset = kwargs.get("offset", 0) - +def set_default_dates(filter: SessionsFilter) -> SessionsFilter: + data = {} + if filter.session != SESSION_AVG and filter.session != SESSION_DIST: + date_from = filter.date_from + if not filter._date_from: + date_from = timezone.now().replace(hour=0, minute=0, second=0, microsecond=0) + data.update({"date_from": date_from}) + if not filter._date_to: + data.update({"date_to": cast(datetime, date_from) + relativedelta(days=1)}) + else: + if not filter._date_from: + data.update({"date_from": relative_date_parse("-7d")}) + if not filter._date_to: + data.update({"date_to": timezone.now()}) + return filter.with_data({**data, "user_id": filter.user_id}) + + +class ClickhouseSessions(BaseQuery, ClickhouseSessionsAvg, ClickhouseSessionsDist): + def run(self, filter: SessionsFilter, team: Team, *args, **kwargs) -> List[Dict[str, Any]]: result: List = [] - self._set_default_dates(filter) - if filter.session_type == SESSION_AVG: - + filter = set_default_dates(filter) + if filter.session == SESSION_AVG: if filter.compare: current_response = self.calculate_avg(filter, team) parsed_response = convert_to_comparison(current_response, filter, "current") @@ -47,9 +48,7 @@ def run(self, filter: Filter, team: Team, *args, **kwargs) -> List[Dict[str, Any else: result = self.calculate_avg(filter, team) - elif filter.session_type == SESSION_DIST: + elif filter.session == SESSION_DIST: result = self.calculate_dist(filter, team) - else: - result = self.calculate_list(filter, team, limit, offset) return result diff --git a/ee/clickhouse/queries/sessions/distribution.py b/ee/clickhouse/queries/sessions/distribution.py index d57860c15ca1a..36eaadea97eaf 100644 --- a/ee/clickhouse/queries/sessions/distribution.py +++ b/ee/clickhouse/queries/sessions/distribution.py @@ -1,5 +1,6 @@ from ee.clickhouse.client import sync_execute from ee.clickhouse.models.property import parse_prop_clauses +from ee.clickhouse.queries.sessions.util import entity_query_conditions from ee.clickhouse.queries.util 
import parse_timestamps from ee.clickhouse.sql.sessions.distribution import DIST_SQL from posthog.models import Filter, Team @@ -7,20 +8,29 @@ class ClickhouseSessionsDist: def calculate_dist(self, filter: Filter, team: Team): - from posthog.queries.sessions import DIST_LABELS + from posthog.queries.sessions.sessions import DIST_LABELS - parsed_date_from, parsed_date_to = parse_timestamps(filter) + parsed_date_from, parsed_date_to, date_params = parse_timestamps(filter, team.pk) + + filters, params = parse_prop_clauses(filter.properties, team.pk, has_person_id_joined=False) + + entity_conditions, entity_params = entity_query_conditions(filter, team) + if not entity_conditions: + return [] + + params = {**params, **entity_params} + entity_query = " OR ".join(entity_conditions) - filters, params = parse_prop_clauses(filter.properties, team.pk) dist_query = DIST_SQL.format( team_id=team.pk, date_from=parsed_date_from, date_to=parsed_date_to, - filters=filters if filter.properties else "", + filters=filters if filters else "", sessions_limit="", + entity_filter=f"AND ({entity_query})", ) - params = {**params, "team_id": team.pk} + params = {**params, "team_id": team.pk, **date_params} result = sync_execute(dist_query, params) diff --git a/ee/clickhouse/queries/sessions/events.py b/ee/clickhouse/queries/sessions/events.py new file mode 100644 index 0000000000000..0de21843a358b --- /dev/null +++ b/ee/clickhouse/queries/sessions/events.py @@ -0,0 +1,27 @@ +from typing import Any, Dict, List, cast + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.models.event import ClickhouseEventSerializer +from ee.clickhouse.queries.util import parse_timestamps +from ee.clickhouse.sql.sessions.list import SESSION_EVENTS +from posthog.models import Team +from posthog.models.filters.sessions_filter import SessionEventsFilter +from posthog.queries.base import BaseQuery + + +class SessionsListEvents(BaseQuery): + def run(self, filter: SessionEventsFilter, team: Team, *args, **kwargs) -> List[Dict[str, Any]]: + date_from, date_to, date_params = parse_timestamps(filter, team.pk) + + raw_events = sync_execute( + SESSION_EVENTS.format(date_from=date_from, date_to=date_to), + {"team_id": team.pk, "distinct_id": filter.distinct_id, **date_params}, + ) + + return self._serialize(raw_events, cast(str, filter.distinct_id), team.pk) + + def _serialize(self, events: List[List[Any]], distinct_id: str, team_id: int) -> List[Dict]: + data = [] + for uuid, event, properties, timestamp, elements_chain in events: + data.append([uuid, event, properties, timestamp, team_id, None, distinct_id, elements_chain, None, None]) + return cast(List[Dict[str, Any]], ClickhouseEventSerializer(data, many=True, context={"people": None}).data) diff --git a/ee/clickhouse/queries/sessions/list.py b/ee/clickhouse/queries/sessions/list.py index ef517a0799588..b8cb48e5c3c87 100644 --- a/ee/clickhouse/queries/sessions/list.py +++ b/ee/clickhouse/queries/sessions/list.py @@ -1,35 +1,104 @@ -from typing import Dict, List, Tuple +from collections import namedtuple +from typing import Any, Dict, List, Optional, Tuple from ee.clickhouse.client import sync_execute +from ee.clickhouse.models.action import format_entity_filter from ee.clickhouse.models.event import ClickhouseEventSerializer from ee.clickhouse.models.person import get_persons_by_distinct_ids from ee.clickhouse.models.property import parse_prop_clauses -from ee.clickhouse.queries.clickhouse_session_recording import add_session_recording_ids +from 
ee.clickhouse.queries.clickhouse_session_recording import join_with_session_recordings +from ee.clickhouse.queries.sessions.clickhouse_sessions import set_default_dates from ee.clickhouse.queries.util import parse_timestamps -from ee.clickhouse.sql.sessions.list import SESSION_SQL -from posthog.models import Filter, Person, Team +from ee.clickhouse.sql.sessions.list import SESSION_SQL, SESSIONS_DISTINCT_ID_SQL +from posthog.models import Entity, Person +from posthog.models.filters.sessions_filter import SessionsFilter +from posthog.queries.sessions.sessions_list import SessionsList +from posthog.utils import flatten -SESSIONS_LIST_DEFAULT_LIMIT = 50 +Session = Dict +ActionFiltersSQL = namedtuple( + "ActionFiltersSQL", ["select_clause", "matches_action_clauses", "filters_having", "matches_any_clause", "params"] +) -class ClickhouseSessionsList: - def calculate_list(self, filter: Filter, team: Team, limit: int, offset: int): - filters, params = parse_prop_clauses(filter.properties, team.pk) +class ClickhouseSessionsList(SessionsList): + def fetch_page(self) -> Tuple[List[Session], Optional[Dict]]: + limit = self.limit + 1 + self.filter = set_default_dates(self.filter) # type: ignore + offset = self.filter.pagination.get("offset", 0) + distinct_id_offset = self.filter.pagination.get("distinct_id_offset", 0) + + action_filters = format_action_filters(self.filter) + + date_from, date_to, date_params = parse_timestamps(self.filter, self.team.pk) + distinct_ids = self.fetch_distinct_ids( + action_filters, date_from, date_to, date_params, limit, distinct_id_offset + ) - date_from, date_to = parse_timestamps(filter) - params = {**params, "team_id": team.pk, "limit": limit, "offset": offset, "distinct_id_limit": limit + offset} query = SESSION_SQL.format( - date_from=date_from, date_to=date_to, filters=filters, sessions_limit="LIMIT %(offset)s, %(limit)s", + date_from=date_from, + date_to=date_to, + filters_select_clause=action_filters.select_clause, + matches_action_clauses=action_filters.matches_action_clauses, + filters_having=action_filters.filters_having, + sessions_limit="LIMIT %(offset)s, %(limit)s", + ) + query_result = sync_execute( + query, + { + **action_filters.params, + "team_id": self.team.pk, + "limit": limit, + "offset": offset, + "distinct_ids": distinct_ids, + **date_params, + }, ) - query_result = sync_execute(query, params) result = self._parse_list_results(query_result) - self._add_person_properties(team, result) - add_session_recording_ids(team, result) + pagination = None + if len(distinct_ids) >= limit + distinct_id_offset or len(result) == limit: + if len(result) == limit: + result.pop() + pagination = {"offset": offset + len(result), "distinct_id_offset": distinct_id_offset + limit} + + self._add_person_properties(result) + + return join_with_session_recordings(self.team, result, self.filter), pagination - return result + def fetch_distinct_ids( + self, + action_filters: ActionFiltersSQL, + date_from: str, + date_to: str, + date_params: Dict[str, Any], + limit: int, + distinct_id_offset: int, + ) -> List[str]: + if self.filter.distinct_id: + persons = get_persons_by_distinct_ids(self.team.pk, [self.filter.distinct_id]) + return persons[0].distinct_ids if len(persons) > 0 else [] - def _add_person_properties(self, team=Team, sessions=List[Tuple]): + person_filters, person_filter_params = parse_prop_clauses( + self.filter.person_filter_properties, self.team.pk, allow_denormalized_props=False + ) + return sync_execute( + SESSIONS_DISTINCT_ID_SQL.format( + date_from=date_from, 
+ date_to=date_to, + person_filters=person_filters, + action_filters=action_filters.matches_any_clause, + ), + { + **person_filter_params, + **action_filters.params, + "team_id": self.team.pk, + "distinct_id_limit": distinct_id_offset + limit, + **date_params, + }, + ) + + def _add_person_properties(self, sessions: List[Session]): distinct_id_hash = {} for session in sessions: distinct_id_hash[session["distinct_id"]] = True @@ -38,7 +107,7 @@ def _add_person_properties(self, team=Team, sessions=List[Tuple]): if len(distinct_ids) == 0: return - persons = get_persons_by_distinct_ids(team.pk, distinct_ids) + persons = get_persons_by_distinct_ids(self.team.pk, distinct_ids) distinct_to_person: Dict[str, Person] = {} for person in persons: @@ -47,37 +116,68 @@ def _add_person_properties(self, team=Team, sessions=List[Tuple]): for session in sessions: if distinct_to_person.get(session["distinct_id"], None): - session["properties"] = distinct_to_person[session["distinct_id"]].properties + session["email"] = distinct_to_person[session["distinct_id"]].properties.get("email") def _parse_list_results(self, results: List[Tuple]): - final = [] - for result in results: - events = [] - for i in range(len(result[4])): - event = [ - result[4][i], # uuid - result[5][i], # event - result[6][i], # properties - result[7][i], # timestamp - None, # team_id, - result[0], # distinct_id - result[8][i], # elements_chain - None, # properties keys - None, # properties values - ] - events.append(ClickhouseEventSerializer(event, many=False).data) - - final.append( - { - "distinct_id": result[0], - "global_session_id": result[1], - "length": result[2], - "start_time": result[3], - "end_time": result[9], - "event_count": len(result[4]), - "events": list(events), - "properties": {}, - } - ) - - return final + return [ + { + "distinct_id": result[0], + "global_session_id": result[1], + "length": result[2], + "start_time": result[3], + "end_time": result[4], + "start_url": _process_url(result[5]), + "end_url": _process_url(result[6]), + "matching_events": list(sorted(set(flatten(result[7:])))), + } + for result in results + ] + + +def format_action_filters(filter: SessionsFilter) -> ActionFiltersSQL: + if len(filter.action_filters) == 0: + return ActionFiltersSQL("", "", "", "", {}) + + matches_action_clauses = select_clause = "" + having_clause = [] + matches_any_clause = [] + + params: Dict = {} + + for index, entity in enumerate(filter.action_filters): + condition_sql, filter_params = format_action_filter_aggregate(entity, prepend=f"event_matcher_{index}") + + matches_action_clauses += f", ({condition_sql}) ? 
uuid : NULL as event_match_{index}" + select_clause += f", groupArray(event_match_{index}) as event_match_{index}" + having_clause.append(f"notEmpty(event_match_{index})") + matches_any_clause.append(condition_sql) + + params = {**params, **filter_params} + + return ActionFiltersSQL( + select_clause, + matches_action_clauses, + f"HAVING {' AND '.join(having_clause)}", + f"AND ({' OR '.join(matches_any_clause)})", + params, + ) + + +def format_action_filter_aggregate(entity: Entity, prepend: str): + filter_sql, params = format_entity_filter(entity, prepend=prepend, filter_by_team=False) + if entity.properties: + filters, filter_params = parse_prop_clauses( + entity.properties, prepend=prepend, team_id=None, allow_denormalized_props=False, has_person_id_joined=False + ) + filter_sql += f" {filters}" + params = {**params, **filter_params} + + return filter_sql, params + + +def _process_url(url: Optional[str]) -> Optional[str]: + if url is not None: + url = url.strip('"') + if url == "": + url = None + return url diff --git a/ee/clickhouse/queries/sessions/util.py b/ee/clickhouse/queries/sessions/util.py new file mode 100644 index 0000000000000..43330c371b967 --- /dev/null +++ b/ee/clickhouse/queries/sessions/util.py @@ -0,0 +1,42 @@ +from typing import Any, Dict, List, Tuple + +from ee.clickhouse.models.action import format_action_filter +from posthog.constants import TREND_FILTER_TYPE_ACTIONS +from posthog.models import Action, Entity, Filter, Team + + +def event_entity_to_query(entity: Entity, team: Team, prepend="event_entity") -> Tuple[str, Dict]: + event_query = "event = %({})s ".format(prepend) + params = {prepend: entity.id} + + if entity.properties: + from ee.clickhouse.models.property import parse_prop_clauses + + prop_query, prop_params = parse_prop_clauses( + entity.properties, + team_id=team.pk, + prepend="{}_props".format(prepend), + allow_denormalized_props=False, + has_person_id_joined=False, + ) + event_query += prop_query + params = {**params, **prop_params} + + return f"({event_query})", params + + +def entity_query_conditions(filter: Filter, team: Team) -> Tuple[List[str], Dict]: + entity_conditions = [] + params: Dict[str, Any] = {} + for index, entity in enumerate(filter.entities): + if entity.type == TREND_FILTER_TYPE_ACTIONS: + action = entity.get_action() + action_query, action_params = format_action_filter(action, prepend=f"action_{index}") + entity_conditions.append(action_query) + params = {**params, **action_params} + else: + event_query, event_params = event_entity_to_query(entity, team, prepend=f"event_{index}") + entity_conditions.append(event_query) + params = {**params, **event_params} + + return entity_conditions, params diff --git a/ee/clickhouse/queries/test/__snapshots__/test_breakdown_props.ambr b/ee/clickhouse/queries/test/__snapshots__/test_breakdown_props.ambr new file mode 100644 index 0000000000000..88983b0075ae9 --- /dev/null +++ b/ee/clickhouse/queries/test/__snapshots__/test_breakdown_props.ambr @@ -0,0 +1,108 @@ +# name: TestBreakdownProps.test_breakdown_group_props + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = '$pageview' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp 
<= '2020-01-12 23:59:59' + AND (isNull(trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'out'))) + OR NOT JSONHas(group_properties_0, 'out')) + GROUP BY value + ORDER BY count DESC + LIMIT 5 + OFFSET 0) + ' +--- +# name: TestBreakdownProps.test_breakdown_person_props + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(person_props, '$browser')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON e.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT id, + argMax(properties, _timestamp) as person_props + FROM person + WHERE team_id = 2 + GROUP BY id + HAVING max(is_deleted) = 0) person ON pdi.person_id = person.id + WHERE team_id = 2 + AND event = '$pageview' + AND timestamp >= '2019-12-21 00:00:00' + AND timestamp <= '2020-01-04 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 5 + OFFSET 0) + ' +--- +# name: TestBreakdownProps.test_breakdown_person_props_materialized + ' + + SELECT groupArray(value) + FROM + (SELECT pmat_$browser AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON e.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT id, + argMax(pmat_$browser, _timestamp) as pmat_$browser + FROM person + WHERE team_id = 2 + GROUP BY id + HAVING max(is_deleted) = 0) person ON pdi.person_id = person.id + WHERE team_id = 2 + AND event = '$pageview' + AND timestamp >= '2019-12-21 00:00:00' + AND timestamp <= '2020-01-04 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 5 + OFFSET 0) + ' +--- diff --git a/ee/clickhouse/queries/test/__snapshots__/test_event_query.ambr b/ee/clickhouse/queries/test/__snapshots__/test_event_query.ambr new file mode 100644 index 0000000000000..378ee0bf09025 --- /dev/null +++ b/ee/clickhouse/queries/test/__snapshots__/test_event_query.ambr @@ -0,0 +1,369 @@ +# name: TestEventQuery.test_account_filters + ' + + SELECT DISTINCT p.id + FROM + (SELECT * + FROM person + JOIN + (SELECT id, + max(_timestamp) as _timestamp, + max(is_deleted) as is_deleted + FROM person + WHERE team_id = 2 + GROUP BY id) as person_max ON person.id = person_max.id + AND person._timestamp = person_max._timestamp + WHERE team_id = 2 + AND person_max.is_deleted = 0 ) AS p + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON p.id = pdi.person_id + WHERE team_id = 2 + AND pdi.distinct_id IN + (SELECT distinct_id + FROM + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) + WHERE person_id IN + (select id + from + (SELECT * + FROM person + JOIN + (SELECT 
id, + max(_timestamp) as _timestamp, + max(is_deleted) as is_deleted + FROM person + WHERE team_id = 2 + GROUP BY id) as person_max ON person.id = person_max.id + AND person._timestamp = person_max._timestamp + WHERE team_id = 2 + AND person_max.is_deleted = 0 + AND (has(['Jane'], trim(BOTH '"' + FROM JSONExtractRaw(properties, 'name')))) )) ) + ' +--- +# name: TestEventQuery.test_account_filters.1 + ' + + SELECT e.timestamp as timestamp, + e.properties as properties + FROM events e + WHERE team_id = 2 + AND event = 'event_name' + AND toStartOfDay(timestamp) >= toStartOfDay(toDateTime('2021-01-14 00:00:00')) + AND timestamp <= '2021-01-21 23:59:59' + AND has(['Jane'], trim(BOTH '"' + FROM JSONExtractRaw(properties, 'name'))) + AND team_id = 2 + ' +--- +# name: TestEventQuery.test_basic_event_filter + ' + + SELECT e.timestamp as timestamp + FROM events e + WHERE team_id = 2 + AND event = 'viewed' + AND toStartOfDay(timestamp) >= toStartOfDay(toDateTime('2021-05-01 00:00:00')) + AND timestamp <= '2021-05-07 23:59:59' + ' +--- +# name: TestEventQuery.test_cohort_filter + ' + + SELECT e.timestamp as timestamp, + pdi.person_id as person_id + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT id + FROM person + WHERE team_id = 2 + GROUP BY id + HAVING max(is_deleted) = 0) person ON person.id = pdi.person_id + WHERE team_id = 2 + AND event = 'viewed' + AND toStartOfDay(timestamp) >= toStartOfDay(toDateTime('2021-05-01 00:00:00')) + AND timestamp <= '2021-05-07 23:59:59' + AND pdi.person_id IN + (select id + from + (SELECT * + FROM person + JOIN + (SELECT id, + max(_timestamp) as _timestamp, + max(is_deleted) as is_deleted + FROM person + WHERE team_id = 2 + GROUP BY id) as person_max ON person.id = person_max.id + AND person._timestamp = person_max._timestamp + WHERE team_id = 2 + AND person_max.is_deleted = 0 + AND (has(['test'], trim(BOTH '"' + FROM JSONExtractRaw(properties, 'name')))) )) + ' +--- +# name: TestEventQuery.test_denormalised_props + ' + + SELECT e.timestamp as timestamp, + e.mat_test_prop as mat_test_prop + FROM events e + WHERE team_id = 2 + AND event = 'user signed up' + AND toStartOfDay(timestamp) >= toStartOfDay(toDateTime('2020-01-01 00:00:00')) + AND timestamp <= '2020-01-14 23:59:59' + AND has(['hi'], mat_test_prop) + AND team_id = 2 + AND has(['hi'], mat_test_prop) + AND team_id = 2 + ' +--- +# name: TestEventQuery.test_element + ' + + SELECT e.timestamp as timestamp + FROM events e + WHERE team_id = 2 + AND event = 'event_name' + AND toStartOfDay(timestamp) >= toStartOfDay(toDateTime('2021-01-14 00:00:00')) + AND timestamp <= '2021-01-21 23:59:59' + AND ((match(elements_chain, '(^|;)label(\\.|$|;|:)'))) + ' +--- +# name: TestEventQuery.test_element.1 + ' + + SELECT e.timestamp as timestamp + FROM events e + WHERE team_id = 2 + AND event = 'event_name' + AND toStartOfDay(timestamp) >= toStartOfDay(toDateTime('2021-01-14 00:00:00')) + AND timestamp <= '2021-01-21 23:59:59' + AND 0 = 192 + ' +--- +# name: TestEventQuery.test_entity_filtered_by_cohort + ' + + SELECT e.timestamp as timestamp, + pdi.person_id as person_id + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, 
+ person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT id + FROM person + WHERE team_id = 2 + GROUP BY id + HAVING max(is_deleted) = 0) person ON person.id = pdi.person_id + WHERE team_id = 2 + AND event = '$pageview' + AND toStartOfDay(timestamp) >= toStartOfDay(toDateTime('2021-05-01 00:00:00')) + AND timestamp <= '2021-05-07 23:59:59' + AND pdi.person_id IN + (select id + from + (SELECT * + FROM person + JOIN + (SELECT id, + max(_timestamp) as _timestamp, + max(is_deleted) as is_deleted + FROM person + WHERE team_id = 2 + GROUP BY id) as person_max ON person.id = person_max.id + AND person._timestamp = person_max._timestamp + WHERE team_id = 2 + AND person_max.is_deleted = 0 + AND (has(['test'], trim(BOTH '"' + FROM JSONExtractRaw(properties, 'name')))) )) + ' +--- +# name: TestEventQuery.test_event_properties_filter + ' + + SELECT e.timestamp as timestamp, + e.properties as properties + FROM events e + WHERE team_id = 2 + AND event = 'viewed' + AND toStartOfDay(timestamp) >= toStartOfDay(toDateTime('2021-05-01 00:00:00')) + AND timestamp <= '2021-05-07 23:59:59' + AND has(['test_val'], trim(BOTH '"' + FROM JSONExtractRaw(properties, 'some_key'))) + AND team_id = 2 + ' +--- +# name: TestEventQuery.test_event_properties_filter.1 + ' + + SELECT e.timestamp as timestamp + FROM events e + WHERE team_id = 2 + AND event = 'viewed' + AND toStartOfDay(timestamp) >= toStartOfDay(toDateTime('2021-05-01 00:00:00')) + AND timestamp <= '2021-05-07 23:59:59' + AND has(['test_val'], trim(BOTH '"' + FROM JSONExtractRaw(properties, 'some_key'))) + AND team_id = 2 + ' +--- +# name: TestEventQuery.test_groups_filters + ' + + SELECT e.timestamp as timestamp + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_1 + FROM groups + WHERE team_id = 2 + AND group_type_index = 1 + GROUP BY group_key) groups_1 ON $group_1 == groups_1.group_key + WHERE team_id = 2 + AND event = '$pageview' + AND toStartOfDay(timestamp) >= toStartOfDay(toDateTime('2020-01-01 00:00:00')) + AND timestamp <= '2020-01-12 23:59:59' + AND has(['finance'], trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry'))) + AND has(['value'], trim(BOTH '"' + FROM JSONExtractRaw(group_properties_1, 'another'))) + ' +--- +# name: TestEventQuery.test_groups_filters_mixed + ' + + SELECT e.timestamp as timestamp, + pdi.person_id as person_id + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT id + FROM person + WHERE team_id = 2 + GROUP BY id + HAVING max(is_deleted) = 0 + AND has(['test'], trim(BOTH '"' + FROM JSONExtractRaw(argMax(person.properties, _timestamp), '$browser')))) person ON person.id = pdi.person_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id 
= 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = '$pageview' + AND toStartOfDay(timestamp) >= toStartOfDay(toDateTime('2020-01-01 00:00:00')) + AND timestamp <= '2020-01-12 23:59:59' + AND has(['finance'], trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry'))) + ' +--- +# name: TestEventQuery.test_static_cohort_filter + ' + + SELECT e.timestamp as timestamp, + pdi.person_id as person_id + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT id + FROM person + WHERE team_id = 2 + GROUP BY id + HAVING max(is_deleted) = 0) person ON person.id = pdi.person_id + WHERE team_id = 2 + AND event = 'viewed' + AND toStartOfDay(timestamp) >= toStartOfDay(toDateTime('2021-05-01 00:00:00')) + AND timestamp <= '2021-05-07 23:59:59' + AND person_id IN + (SELECT person_id + FROM person_static_cohort + WHERE cohort_id = 2 + AND team_id = 2) + ' +--- diff --git a/ee/clickhouse/queries/test/__snapshots__/test_groups_join_query.ambr b/ee/clickhouse/queries/test/__snapshots__/test_groups_join_query.ambr new file mode 100644 index 0000000000000..ef65dda244499 --- /dev/null +++ b/ee/clickhouse/queries/test/__snapshots__/test_groups_join_query.ambr @@ -0,0 +1,21 @@ +# name: test_groups_join_query_filtering + ( + ' + + INNER JOIN ( + SELECT + group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = %(team_id)s AND group_type_index = %(group_index_0)s + GROUP BY group_key + ) groups_0 + ON $group_0 == groups_0.group_key + + ', + { + 'group_index_0': 0, + 'team_id': 2, + }, + ) +--- diff --git a/ee/clickhouse/queries/test/__snapshots__/test_person_query.ambr b/ee/clickhouse/queries/test/__snapshots__/test_person_query.ambr new file mode 100644 index 0000000000000..684996494eb4c --- /dev/null +++ b/ee/clickhouse/queries/test/__snapshots__/test_person_query.ambr @@ -0,0 +1,77 @@ +# name: test_person_query + ' + + SELECT id + FROM person + WHERE team_id = %(team_id)s + GROUP BY id + HAVING max(is_deleted) = 0 + + ' +--- +# name: test_person_query.1 + ' + + SELECT id + FROM person + WHERE team_id = %(team_id)s + GROUP BY id + HAVING max(is_deleted) = 0 AND argMax(person.pmat_email, _timestamp) ILIKE %(vpersonquery_1)s + + ' +--- +# name: test_person_query_with_entity_filters + ' + + SELECT id, argMax(pmat_email, _timestamp) as pmat_email + FROM person + WHERE team_id = %(team_id)s + GROUP BY id + HAVING max(is_deleted) = 0 + + ' +--- +# name: test_person_query_with_entity_filters.1 + ' + + SELECT id + FROM person + WHERE team_id = %(team_id)s + GROUP BY id + HAVING max(is_deleted) = 0 AND argMax(person.pmat_email, _timestamp) ILIKE %(vpersonquery_0)s + + ' +--- +# name: test_person_query_with_extra_fields + ' + + SELECT id, argMax(pmat_email, _timestamp) as pmat_email , argMax(properties, _timestamp) as person_props + FROM person + WHERE team_id = %(team_id)s + GROUP BY id + HAVING max(is_deleted) = 0 AND argMax(person.pmat_email, _timestamp) ILIKE %(vpersonquery_0)s + + ' +--- +# name: test_person_query_with_extra_requested_fields + ' + + SELECT id, argMax(properties, _timestamp) as person_props + FROM person + WHERE team_id = %(team_id)s + GROUP BY id 
+ HAVING max(is_deleted) = 0 AND argMax(person.pmat_email, _timestamp) ILIKE %(vpersonquery_0)s + + ' +--- +# name: test_person_query_with_extra_requested_fields.1 + ' + + SELECT id, argMax(pmat_email, _timestamp) as pmat_email + FROM person + WHERE team_id = %(team_id)s + GROUP BY id + HAVING max(is_deleted) = 0 AND argMax(person.pmat_email, _timestamp) ILIKE %(vpersonquery_0)s + + ' +--- diff --git a/ee/clickhouse/queries/test/__snapshots__/test_retention.ambr b/ee/clickhouse/queries/test/__snapshots__/test_retention.ambr new file mode 100644 index 0000000000000..7a061eb13ca54 --- /dev/null +++ b/ee/clickhouse/queries/test/__snapshots__/test_retention.ambr @@ -0,0 +1,439 @@ +# name: TestClickhouseRetention.test_groups_aggregating + ' + + SELECT datediff('Week', toStartOfWeek(toDateTime('2020-06-07 00:00:00')), reference_event.event_date) as base_interval, + datediff('Week', reference_event.event_date, toStartOfWeek(toDateTime(event_date))) as intervals_from_base, + COUNT(DISTINCT event.target) count + FROM + (SELECT e.timestamp AS event_date, + e.$group_0 as target, + e.uuid AS uuid, + e.event AS event + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + WHERE team_id = 2 + AND e.event = '$pageview' + AND toDateTime(e.timestamp) >= toDateTime('2020-06-07 00:00:00') + AND toDateTime(e.timestamp) <= toDateTime('2020-07-27 00:00:00') + AND NOT has([''], $group_0) + AND team_id = 2 ) event + JOIN + (SELECT DISTINCT toStartOfWeek(e.timestamp) AS event_date, + e.$group_0 as target, + e.uuid AS uuid, + e.event AS event + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + WHERE team_id = 2 + AND e.event = '$pageview' + AND toDateTime(e.timestamp) >= toDateTime('2020-06-07 00:00:00') + AND toDateTime(e.timestamp) <= toDateTime('2020-07-27 00:00:00') + AND NOT has([''], $group_0) + AND team_id = 2 ) reference_event ON (event.target = reference_event.target) + WHERE toStartOfWeek(event.event_date) > toStartOfWeek(reference_event.event_date) + GROUP BY base_interval, + intervals_from_base + ORDER BY base_interval, + intervals_from_base + ' +--- +# name: TestClickhouseRetention.test_groups_aggregating.1 + ' + + SELECT datediff('Week', toStartOfWeek(toDateTime('2020-06-07 00:00:00')), event_date) event_date, + count(DISTINCT target) + FROM + (SELECT DISTINCT toStartOfWeek(e.timestamp) AS event_date, + e.$group_0 as target, + e.uuid AS uuid, + e.event AS event + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + WHERE team_id = 2 + AND e.event = '$pageview' + AND toDateTime(e.timestamp) >= toDateTime('2020-06-07 00:00:00') + AND toDateTime(e.timestamp) <= 
toDateTime('2020-07-27 00:00:00') + AND NOT has([''], $group_0) + AND team_id = 2 ) + GROUP BY event_date + ORDER BY event_date + ' +--- +# name: TestClickhouseRetention.test_groups_aggregating.2 + ' + + SELECT datediff('Week', toStartOfWeek(toDateTime('2020-06-07 00:00:00')), reference_event.event_date) as base_interval, + datediff('Week', reference_event.event_date, toStartOfWeek(toDateTime(event_date))) as intervals_from_base, + COUNT(DISTINCT event.target) count + FROM + (SELECT e.timestamp AS event_date, + e.$group_1 as target, + e.uuid AS uuid, + e.event AS event + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + WHERE team_id = 2 + AND e.event = '$pageview' + AND toDateTime(e.timestamp) >= toDateTime('2020-06-07 00:00:00') + AND toDateTime(e.timestamp) <= toDateTime('2020-07-27 00:00:00') + AND NOT has([''], $group_1) + AND team_id = 2 ) event + JOIN + (SELECT DISTINCT toStartOfWeek(e.timestamp) AS event_date, + e.$group_1 as target, + e.uuid AS uuid, + e.event AS event + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + WHERE team_id = 2 + AND e.event = '$pageview' + AND toDateTime(e.timestamp) >= toDateTime('2020-06-07 00:00:00') + AND toDateTime(e.timestamp) <= toDateTime('2020-07-27 00:00:00') + AND NOT has([''], $group_1) + AND team_id = 2 ) reference_event ON (event.target = reference_event.target) + WHERE toStartOfWeek(event.event_date) > toStartOfWeek(reference_event.event_date) + GROUP BY base_interval, + intervals_from_base + ORDER BY base_interval, + intervals_from_base + ' +--- +# name: TestClickhouseRetention.test_groups_aggregating.3 + ' + + SELECT datediff('Week', toStartOfWeek(toDateTime('2020-06-07 00:00:00')), event_date) event_date, + count(DISTINCT target) + FROM + (SELECT DISTINCT toStartOfWeek(e.timestamp) AS event_date, + e.$group_1 as target, + e.uuid AS uuid, + e.event AS event + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + WHERE team_id = 2 + AND e.event = '$pageview' + AND toDateTime(e.timestamp) >= toDateTime('2020-06-07 00:00:00') + AND toDateTime(e.timestamp) <= toDateTime('2020-07-27 00:00:00') + AND NOT has([''], $group_1) + AND team_id = 2 ) + GROUP BY event_date + ORDER BY event_date + ' +--- +# name: TestClickhouseRetention.test_groups_filtering + ' + + SELECT datediff('Week', toStartOfWeek(toDateTime('2020-06-07 00:00:00')), reference_event.event_date) as base_interval, + datediff('Week', reference_event.event_date, toStartOfWeek(toDateTime(event_date))) as intervals_from_base, + COUNT(DISTINCT event.target) count + FROM + (SELECT e.timestamp AS event_date, + pdi.person_id as target, + e.uuid AS uuid, + e.event 
AS event + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND e.event = '$pageview' + AND toDateTime(e.timestamp) >= toDateTime('2020-06-07 00:00:00') + AND toDateTime(e.timestamp) <= toDateTime('2020-07-27 00:00:00') + AND has(['technology'], trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry'))) ) event + JOIN + (SELECT DISTINCT toStartOfWeek(e.timestamp) AS event_date, + pdi.person_id as target, + e.uuid AS uuid, + e.event AS event + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND e.event = '$pageview' + AND toDateTime(e.timestamp) >= toDateTime('2020-06-07 00:00:00') + AND toDateTime(e.timestamp) <= toDateTime('2020-07-27 00:00:00') + AND has(['technology'], trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry'))) ) reference_event ON (event.target = reference_event.target) + WHERE toStartOfWeek(event.event_date) > toStartOfWeek(reference_event.event_date) + GROUP BY base_interval, + intervals_from_base + ORDER BY base_interval, + intervals_from_base + ' +--- +# name: TestClickhouseRetention.test_groups_filtering.1 + ' + + SELECT datediff('Week', toStartOfWeek(toDateTime('2020-06-07 00:00:00')), event_date) event_date, + count(DISTINCT target) + FROM + (SELECT DISTINCT toStartOfWeek(e.timestamp) AS event_date, + pdi.person_id as target, + e.uuid AS uuid, + e.event AS event + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND e.event = '$pageview' + AND toDateTime(e.timestamp) >= toDateTime('2020-06-07 00:00:00') + AND toDateTime(e.timestamp) <= toDateTime('2020-07-27 00:00:00') + AND has(['technology'], trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry'))) ) + GROUP BY event_date + ORDER BY event_date + ' +--- +# name: TestClickhouseRetention.test_groups_filtering.2 + ' + + SELECT datediff('Week', toStartOfWeek(toDateTime('2020-06-07 00:00:00')), reference_event.event_date) as base_interval, 
+ datediff('Week', reference_event.event_date, toStartOfWeek(toDateTime(event_date))) as intervals_from_base, + COUNT(DISTINCT event.target) count + FROM + (SELECT e.timestamp AS event_date, + pdi.person_id as target, + e.uuid AS uuid, + e.event AS event + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND e.event = '$pageview' + AND toDateTime(e.timestamp) >= toDateTime('2020-06-07 00:00:00') + AND toDateTime(e.timestamp) <= toDateTime('2020-07-27 00:00:00') + AND JSONHas(group_properties_0, 'industry') ) event + JOIN + (SELECT DISTINCT toStartOfWeek(e.timestamp) AS event_date, + pdi.person_id as target, + e.uuid AS uuid, + e.event AS event + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND e.event = '$pageview' + AND toDateTime(e.timestamp) >= toDateTime('2020-06-07 00:00:00') + AND toDateTime(e.timestamp) <= toDateTime('2020-07-27 00:00:00') + AND JSONHas(group_properties_0, 'industry') ) reference_event ON (event.target = reference_event.target) + WHERE toStartOfWeek(event.event_date) > toStartOfWeek(reference_event.event_date) + GROUP BY base_interval, + intervals_from_base + ORDER BY base_interval, + intervals_from_base + ' +--- +# name: TestClickhouseRetention.test_groups_filtering.3 + ' + + SELECT datediff('Week', toStartOfWeek(toDateTime('2020-06-07 00:00:00')), event_date) event_date, + count(DISTINCT target) + FROM + (SELECT DISTINCT toStartOfWeek(e.timestamp) AS event_date, + pdi.person_id as target, + e.uuid AS uuid, + e.event AS event + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND e.event = '$pageview' + AND toDateTime(e.timestamp) >= toDateTime('2020-06-07 00:00:00') + AND toDateTime(e.timestamp) <= toDateTime('2020-07-27 00:00:00') + AND JSONHas(group_properties_0, 'industry') ) + GROUP BY event_date + ORDER BY event_date + ' +--- diff --git a/ee/clickhouse/queries/test/__snapshots__/test_trends.ambr 
b/ee/clickhouse/queries/test/__snapshots__/test_trends.ambr new file mode 100644 index 0000000000000..cb9e5fe27f56d --- /dev/null +++ b/ee/clickhouse/queries/test/__snapshots__/test_trends.ambr @@ -0,0 +1,410 @@ +# name: TestClickhouseTrends.test_aggregating_by_group + ' + + SELECT groupArray(day_start) as date, + groupArray(count) as data + FROM + (SELECT SUM(total) AS count, + day_start + from + (SELECT toUInt16(0) AS total, + toStartOfDay(toDateTime('2020-01-12 23:59:59') - toIntervalDay(number)) AS day_start + FROM numbers(dateDiff('day', toDateTime('2020-01-01 00:00:00'), toDateTime('2020-01-12 23:59:59'))) + UNION ALL SELECT toUInt16(0) AS total, + toStartOfDay(toDateTime('2020-01-01 00:00:00')) + UNION ALL SELECT count(DISTINCT $group_0) as data, + toDateTime(toStartOfDay(timestamp), 'UTC') as date + FROM + (SELECT e.timestamp as timestamp, + e.$group_0 as $group_0 + FROM events e + WHERE team_id = 2 + AND event = '$pageview' + AND toStartOfDay(timestamp) >= toStartOfDay(toDateTime('2020-01-01 00:00:00')) + AND timestamp <= '2020-01-12 23:59:59' + AND NOT has([''], $group_0) + AND team_id = 2 ) + GROUP BY toStartOfDay(timestamp)) + group by day_start + order by day_start) + ' +--- +# name: TestClickhouseTrends.test_aggregating_by_group.1 + ' + + SELECT groupArray(day_start) as date, + groupArray(count) as data + FROM + (SELECT SUM(total) AS count, + day_start + from + (SELECT toUInt16(0) AS total, + toStartOfDay(toDateTime('2020-01-12 23:59:59') - toIntervalDay(number)) AS day_start + FROM numbers(dateDiff('day', toDateTime('2020-01-01 00:00:00'), toDateTime('2020-01-12 23:59:59'))) + UNION ALL SELECT toUInt16(0) AS total, + toStartOfDay(toDateTime('2020-01-01 00:00:00')) + UNION ALL SELECT count(DISTINCT $group_1) as data, + toDateTime(toStartOfDay(timestamp), 'UTC') as date + FROM + (SELECT e.timestamp as timestamp, + e.$group_1 as $group_1 + FROM events e + WHERE team_id = 2 + AND event = '$pageview' + AND toStartOfDay(timestamp) >= toStartOfDay(toDateTime('2020-01-01 00:00:00')) + AND timestamp <= '2020-01-12 23:59:59' + AND NOT has([''], $group_1) + AND team_id = 2 ) + GROUP BY toStartOfDay(timestamp)) + group by day_start + order by day_start) + ' +--- +# name: TestClickhouseTrends.test_breakdown_by_group_props + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-12 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 25 + OFFSET 0) + ' +--- +# name: TestClickhouseTrends.test_breakdown_by_group_props.1 + ' + + SELECT groupArray(day_start) as date, + groupArray(count) as data, + breakdown_value + FROM + (SELECT SUM(total) as count, + day_start, + breakdown_value + FROM + (SELECT * + FROM + (SELECT toUInt16(0) AS total, + ticks.day_start as day_start, + breakdown_value + FROM + (SELECT toStartOfDay(toDateTime('2020-01-12 23:59:59') - number * 86400) as day_start + FROM numbers(12) + UNION ALL SELECT toStartOfDay(toDateTime('2020-01-01 00:00:00')) as day_start) as ticks + CROSS JOIN + (SELECT breakdown_value + FROM + (SELECT ['finance', 'technology'] as breakdown_value) ARRAY + JOIN breakdown_value) as sec + ORDER BY breakdown_value, + 
day_start + UNION ALL SELECT count(*) as total, + toDateTime(toStartOfDay(timestamp), 'UTC') as day_start, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) as breakdown_value + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE e.team_id = 2 + AND event = 'sign up' + AND toStartOfDay(timestamp) >= toStartOfDay(toDateTime('2020-01-01 00:00:00')) + AND timestamp <= '2020-01-12 23:59:59' + AND trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) in (['finance', 'technology']) + GROUP BY day_start, + breakdown_value)) + GROUP BY day_start, + breakdown_value + ORDER BY breakdown_value, + day_start) + GROUP BY breakdown_value + ' +--- +# name: TestClickhouseTrends.test_breakdown_by_group_props_with_person_filter + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON e.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT id + FROM person + WHERE team_id = 2 + GROUP BY id + HAVING max(is_deleted) = 0 + AND has(['value'], trim(BOTH '"' + FROM JSONExtractRaw(argMax(person.properties, _timestamp), 'key')))) person ON pdi.person_id = person.id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-12 23:59:59' + GROUP BY value + ORDER BY count DESC + LIMIT 25 + OFFSET 0) + ' +--- +# name: TestClickhouseTrends.test_breakdown_by_group_props_with_person_filter.1 + ' + + SELECT groupArray(day_start) as date, + groupArray(count) as data, + breakdown_value + FROM + (SELECT SUM(total) as count, + day_start, + breakdown_value + FROM + (SELECT * + FROM + (SELECT toUInt16(0) AS total, + ticks.day_start as day_start, + breakdown_value + FROM + (SELECT toStartOfDay(toDateTime('2020-01-12 23:59:59') - number * 86400) as day_start + FROM numbers(12) + UNION ALL SELECT toStartOfDay(toDateTime('2020-01-01 00:00:00')) as day_start) as ticks + CROSS JOIN + (SELECT breakdown_value + FROM + (SELECT ['finance'] as breakdown_value) ARRAY + JOIN breakdown_value) as sec + ORDER BY breakdown_value, + day_start + UNION ALL SELECT count(*) as total, + toDateTime(toStartOfDay(timestamp), 'UTC') as day_start, + trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) as breakdown_value + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) as pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT id + FROM person + WHERE team_id = 2 + GROUP BY id + HAVING max(is_deleted) = 0 + AND has(['value'], trim(BOTH '"' + FROM 
JSONExtractRaw(argMax(person.properties, _timestamp), 'key')))) person ON person.id = pdi.person_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE e.team_id = 2 + AND event = 'sign up' + AND toStartOfDay(timestamp) >= toStartOfDay(toDateTime('2020-01-01 00:00:00')) + AND timestamp <= '2020-01-12 23:59:59' + AND trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry')) in (['finance']) + GROUP BY day_start, + breakdown_value)) + GROUP BY day_start, + breakdown_value + ORDER BY breakdown_value, + day_start) + GROUP BY breakdown_value + ' +--- +# name: TestClickhouseTrends.test_breakdown_with_filter_groups + ' + + SELECT groupArray(value) + FROM + (SELECT trim(BOTH '"' + FROM JSONExtractRaw(properties, 'key')) AS value, + count(*) as count + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = 'sign up' + AND timestamp >= '2020-01-01 00:00:00' + AND timestamp <= '2020-01-12 23:59:59' + AND has(['finance'], trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry'))) + GROUP BY value + ORDER BY count DESC + LIMIT 25 + OFFSET 0) + ' +--- +# name: TestClickhouseTrends.test_breakdown_with_filter_groups.1 + ' + + SELECT groupArray(day_start) as date, + groupArray(count) as data, + breakdown_value + FROM + (SELECT SUM(total) as count, + day_start, + breakdown_value + FROM + (SELECT * + FROM + (SELECT toUInt16(0) AS total, + ticks.day_start as day_start, + breakdown_value + FROM + (SELECT toStartOfDay(toDateTime('2020-01-12 23:59:59') - number * 86400) as day_start + FROM numbers(12) + UNION ALL SELECT toStartOfDay(toDateTime('2020-01-01 00:00:00')) as day_start) as ticks + CROSS JOIN + (SELECT breakdown_value + FROM + (SELECT ['oh', 'uh'] as breakdown_value) ARRAY + JOIN breakdown_value) as sec + ORDER BY breakdown_value, + day_start + UNION ALL SELECT count(*) as total, + toDateTime(toStartOfDay(timestamp), 'UTC') as day_start, + trim(BOTH '"' + FROM JSONExtractRaw(properties, 'key')) as breakdown_value + FROM events e + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE e.team_id = 2 + AND event = 'sign up' + AND has(['finance'], trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry'))) + AND toStartOfDay(timestamp) >= toStartOfDay(toDateTime('2020-01-01 00:00:00')) + AND timestamp <= '2020-01-12 23:59:59' + AND trim(BOTH '"' + FROM JSONExtractRaw(properties, 'key')) in (['oh', 'uh']) + GROUP BY day_start, + breakdown_value)) + GROUP BY day_start, + breakdown_value + ORDER BY breakdown_value, + day_start) + GROUP BY breakdown_value + ' +--- +# name: TestClickhouseTrends.test_filtering_with_group_props + ' + + SELECT groupArray(day_start) as date, + groupArray(count) as data + FROM + (SELECT SUM(total) AS count, + day_start + from + (SELECT toUInt16(0) AS total, + toStartOfDay(toDateTime('2020-01-12 23:59:59') - toIntervalDay(number)) AS day_start + FROM numbers(dateDiff('day', toDateTime('2020-01-01 00:00:00'), toDateTime('2020-01-12 23:59:59'))) + UNION ALL SELECT toUInt16(0) AS total, + 
toStartOfDay(toDateTime('2020-01-01 00:00:00')) + UNION ALL SELECT count(*) as data, + toDateTime(toStartOfDay(timestamp), 'UTC') as date + FROM + (SELECT e.timestamp as timestamp, + pdi.person_id as person_id + FROM events e + INNER JOIN + (SELECT distinct_id, + argMax(person_id, _timestamp) as person_id + FROM + (SELECT distinct_id, + person_id, + max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = 2 + GROUP BY person_id, + distinct_id, + team_id + HAVING max(is_deleted) = 0) + GROUP BY distinct_id) AS pdi ON events.distinct_id = pdi.distinct_id + INNER JOIN + (SELECT id + FROM person + WHERE team_id = 2 + GROUP BY id + HAVING max(is_deleted) = 0 + AND has(['value'], trim(BOTH '"' + FROM JSONExtractRaw(argMax(person.properties, _timestamp), 'key')))) person ON person.id = pdi.person_id + INNER JOIN + (SELECT group_key, + argMax(group_properties, _timestamp) AS group_properties_0 + FROM groups + WHERE team_id = 2 + AND group_type_index = 0 + GROUP BY group_key) groups_0 ON $group_0 == groups_0.group_key + WHERE team_id = 2 + AND event = '$pageview' + AND toStartOfDay(timestamp) >= toStartOfDay(toDateTime('2020-01-01 00:00:00')) + AND timestamp <= '2020-01-12 23:59:59' + AND has(['finance'], trim(BOTH '"' + FROM JSONExtractRaw(group_properties_0, 'industry'))) ) + GROUP BY toStartOfDay(timestamp)) + group by day_start + order by day_start) + ' +--- diff --git a/ee/clickhouse/queries/test/test_breakdown_props.py b/ee/clickhouse/queries/test/test_breakdown_props.py new file mode 100644 index 0000000000000..203863686db12 --- /dev/null +++ b/ee/clickhouse/queries/test/test_breakdown_props.py @@ -0,0 +1,161 @@ +from uuid import uuid4 + +from freezegun import freeze_time + +from ee.clickhouse.models.event import create_event +from ee.clickhouse.models.group import create_group +from ee.clickhouse.queries.breakdown_props import get_breakdown_prop_values +from ee.clickhouse.util import ClickhouseTestMixin, snapshot_clickhouse_queries +from posthog.models.cohort import Cohort +from posthog.models.entity import Entity +from posthog.models.filters import Filter +from posthog.models.group_type_mapping import GroupTypeMapping +from posthog.models.person import Person +from posthog.test.base import APIBaseTest, test_with_materialized_columns + + +def _create_event(**kwargs): + kwargs.update({"event_uuid": uuid4()}) + create_event(**kwargs) + + +class TestBreakdownProps(ClickhouseTestMixin, APIBaseTest): + @test_with_materialized_columns(event_properties=["$host", "distinct_id"], person_properties=["$browser", "email"]) + @snapshot_clickhouse_queries + def test_breakdown_person_props(self): + p1 = Person.objects.create(team_id=self.team.pk, distinct_ids=["p1"], properties={"$browser": "test"}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-02T12:00:00Z", + properties={"key": "val"}, + ) + + self.team.test_account_filters = [ + {"key": "email", "type": "person", "value": "posthog.com", "operator": "not_icontains"}, + { + "key": "$host", + "type": "event", + "value": ["127.0.0.1:3000", "127.0.0.1:5000", "localhost:5000", "localhost:8000"], + "operator": "is_not", + }, + {"key": "distinct_id", "type": "event", "value": "posthog.com", "operator": "not_icontains"}, + ] + self.team.save() + with freeze_time("2020-01-04T13:01:01Z"): + filter = Filter( + data={ + "insight": "FUNNELS", + "properties": [], + "filter_test_accounts": True, + "events": [{"id": "$pageview", "name": "$pageview", "type": "events", "order": 0}], + "actions": [], + 
"funnel_viz_type": "steps", + "display": "FunnelViz", + "interval": "day", + "breakdown": "$browser", + "breakdown_type": "person", + "date_from": "-14d", + "funnel_window_days": 14, + } + ) + res = get_breakdown_prop_values( + filter, Entity({"id": "$pageview", "type": "events"}), "count(*)", self.team.pk, 5 + ) + self.assertEqual(res, ["test"]) + + def test_breakdown_person_props_with_entity_filter(self): + p1 = Person.objects.create(team_id=self.team.pk, distinct_ids=["p1"], properties={"$browser": "test"}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-02T12:00:00Z", + properties={"key": "val"}, + ) + p1 = Person.objects.create(team_id=self.team.pk, distinct_ids=["p2"], properties={"$browser": "test2"}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p2", + timestamp="2020-01-02T12:00:00Z", + properties={"key": "val"}, + ) + + cohort = Cohort.objects.create(team=self.team, name="a", groups=[{"properties": {"$browser": "test"}}]) + cohort.calculate_people() + cohort.calculate_people_ch() + + entity_params = [ + { + "id": "$pageview", + "name": "$pageview", + "type": "events", + "order": 0, + "properties": [{"key": "id", "value": cohort.pk, "type": "cohort"}], + } + ] + with self.settings(USE_PRECALCULATED_CH_COHORT_PEOPLE=True): + with freeze_time("2020-01-04T13:01:01Z"): + filter = Filter( + data={ + "insight": "FUNNELS", + "properties": [], + "filter_test_accounts": False, + "events": entity_params, + "actions": [], + "funnel_viz_type": "steps", + "display": "FunnelViz", + "interval": "day", + "breakdown": "$browser", + "breakdown_type": "person", + "date_from": "-14d", + "funnel_window_days": 14, + } + ) + res = get_breakdown_prop_values(filter, Entity(entity_params[0]), "count(*)", self.team.pk, 5) + self.assertEqual(res, ["test"]) + + @snapshot_clickhouse_queries + def test_breakdown_group_props(self): + GroupTypeMapping.objects.create(team=self.team, group_type="organization", group_type_index=0) + GroupTypeMapping.objects.create(team=self.team, group_type="company", group_type_index=1) + + create_group(team_id=self.team.pk, group_type_index=0, group_key="org:5", properties={"industry": "finance"}) + create_group(team_id=self.team.pk, group_type_index=0, group_key="org:6", properties={"industry": "technology"}) + create_group(team_id=self.team.pk, group_type_index=0, group_key="org:7", properties={"industry": "finance"}) + create_group( + team_id=self.team.pk, group_type_index=0, group_key="org:8", properties={"industry": "another", "out": 1} + ) + create_group( + team_id=self.team.pk, group_type_index=1, group_key="company:10", properties={"industry": "foobar"} + ) + # :TRICKY: Test group type overlapping + create_group(team_id=self.team.pk, group_type_index=1, group_key="org:8", properties={"industry": "foobar"}) + + for org_index in range(5, 9): + _create_event( + event="$pageview", + distinct_id="person1", + team=self.team, + properties={"$group_0": f"org:{org_index}"}, + timestamp="2020-01-02T12:00:00Z", + ) + + filter = Filter( + data={ + "date_from": "2020-01-01T00:00:00Z", + "date_to": "2020-01-12T00:00:00Z", + "breakdown": "industry", + "breakdown_type": "group", + "breakdown_group_type_index": 0, + "events": [{"id": "$pageview", "type": "events", "order": 0,}], + "properties": [ + {"key": "out", "value": "", "type": "group", "group_type_index": 0, "operator": "is_not_set"} + ], + }, + team=self.team, + ) + result = get_breakdown_prop_values(filter, filter.entities[0], "count(*)", self.team.pk, 5) + 
self.assertEqual(result, ["finance", "technology"]) diff --git a/ee/clickhouse/queries/test/test_column_optimizer.py b/ee/clickhouse/queries/test/test_column_optimizer.py new file mode 100644 index 0000000000000..ee84029402917 --- /dev/null +++ b/ee/clickhouse/queries/test/test_column_optimizer.py @@ -0,0 +1,193 @@ +from ee.clickhouse.materialized_columns import materialize +from ee.clickhouse.queries.column_optimizer import ColumnOptimizer +from ee.clickhouse.util import ClickhouseTestMixin +from posthog.models import Action, ActionStep +from posthog.models.filters import Filter +from posthog.test.base import APIBaseTest + +PROPERTIES_OF_ALL_TYPES = [ + {"key": "event_prop", "value": ["foo", "bar"], "type": "event"}, + {"key": "person_prop", "value": "efg", "type": "person"}, + {"key": "id", "value": 1, "type": "cohort"}, + {"key": "tag_name", "value": ["label"], "operator": "exact", "type": "element"}, + {"key": "group_prop", "value": ["value"], "operator": "exact", "type": "group", "group_type_index": 2}, +] + +BASE_FILTER = Filter({"events": [{"id": "$pageview", "type": "events", "order": 0}]}) +FILTER_WITH_PROPERTIES = BASE_FILTER.with_data({"properties": PROPERTIES_OF_ALL_TYPES}) + + +class TestColumnOptimizer(ClickhouseTestMixin, APIBaseTest): + def setUp(self): + super().setUp() + self.team.test_account_filters = PROPERTIES_OF_ALL_TYPES + self.team.save() + + def test_properties_used_in_filter(self): + properties_used_in_filter = lambda filter: ColumnOptimizer(filter, self.team.id).properties_used_in_filter + + self.assertEqual(properties_used_in_filter(BASE_FILTER), {}) + self.assertEqual( + properties_used_in_filter(FILTER_WITH_PROPERTIES), + { + ("event_prop", "event", None): 1, + ("person_prop", "person", None): 1, + ("id", "cohort", None): 1, + ("tag_name", "element", None): 1, + ("group_prop", "group", 2): 1, + }, + ) + + # Breakdown cases + filter = BASE_FILTER.with_data({"breakdown": "some_prop", "breakdown_type": "person"}) + self.assertEqual(properties_used_in_filter(filter), {("some_prop", "person", None): 1}) + + filter = BASE_FILTER.with_data({"breakdown": "some_prop", "breakdown_type": "event"}) + self.assertEqual(properties_used_in_filter(filter), {("some_prop", "event", None): 1}) + + filter = BASE_FILTER.with_data({"breakdown": [11], "breakdown_type": "cohort"}) + self.assertEqual(properties_used_in_filter(filter), {}) + + filter = BASE_FILTER.with_data( + {"breakdown": "some_prop", "breakdown_type": "group", "breakdown_group_type_index": 1} + ) + self.assertEqual(properties_used_in_filter(filter), {("some_prop", "group", 1): 1}) + + # Funnel Correlation cases + filter = BASE_FILTER.with_data( + {"funnel_correlation_type": "events", "funnel_correlation_names": ["random_column"]} + ) + self.assertEqual(properties_used_in_filter(filter), {}) + + filter = BASE_FILTER.with_data( + {"funnel_correlation_type": "properties", "funnel_correlation_names": ["random_column", "$browser"]} + ) + self.assertEqual( + properties_used_in_filter(filter), {("random_column", "person", None): 1, ("$browser", "person", None): 1} + ) + + filter = BASE_FILTER.with_data({"funnel_correlation_type": "properties"}) + self.assertEqual(properties_used_in_filter(filter), {}) + + filter = Filter( + data={ + "events": [ + { + "id": "$pageview", + "type": "events", + "order": 0, + "math": "sum", + "math_property": "numeric_prop", + "properties": PROPERTIES_OF_ALL_TYPES, + } + ] + } + ) + self.assertEqual( + properties_used_in_filter(filter), + { + ("numeric_prop", "event", None): 1, + ("event_prop", 
"event", None): 1, + ("person_prop", "person", None): 1, + ("id", "cohort", None): 1, + ("tag_name", "element", None): 1, + ("group_prop", "group", 2): 1, + }, + ) + + filter = Filter( + data={ + "events": [ + { + "id": "$pageview", + "type": "events", + "order": 0, + "math": "unique_group", + "math_group_type_index": 1, + } + ] + } + ) + self.assertEqual( + properties_used_in_filter(filter), {("$group_1", "event", None): 1,}, + ) + + def test_properties_used_in_filter_with_actions(self): + action = Action.objects.create(team=self.team) + ActionStep.objects.create( + event="$autocapture", action=action, url="https://example.com/donate", url_matching=ActionStep.EXACT, + ) + ActionStep.objects.create( + action=action, + event="$autocapture", + tag_name="button", + text="Pay $10", + properties=[{"key": "$browser", "value": "Chrome", "type": "person"}], + ) + + filter = Filter(data={"actions": [{"id": action.id, "math": "dau"}]}) + self.assertEqual( + ColumnOptimizer(filter, self.team.id).properties_used_in_filter, + {("$current_url", "event", None): 1, ("$browser", "person", None): 1}, + ) + + filter = BASE_FILTER.with_data({"exclusions": [{"id": action.id, "type": "actions"}]}) + self.assertEqual( + ColumnOptimizer(filter, self.team.id).properties_used_in_filter, + {("$current_url", "event", None): 1, ("$browser", "person", None): 1}, + ) + + def test_materialized_columns_checks(self): + optimizer = lambda: ColumnOptimizer(FILTER_WITH_PROPERTIES, self.team.id) + + self.assertEqual(optimizer().event_columns_to_query, {"properties"}) + self.assertEqual(optimizer().person_columns_to_query, {"properties"}) + + materialize("events", "event_prop") + materialize("person", "person_prop") + + self.assertEqual(optimizer().event_columns_to_query, {"mat_event_prop"}) + self.assertEqual(optimizer().person_columns_to_query, {"pmat_person_prop"}) + + def test_should_query_element_chain_column(self): + should_query_elements_chain_column = lambda filter: ColumnOptimizer( + filter, self.team.id + ).should_query_elements_chain_column + + self.assertEqual(should_query_elements_chain_column(BASE_FILTER), False) + self.assertEqual(should_query_elements_chain_column(FILTER_WITH_PROPERTIES), True) + + filter = Filter( + data={"events": [{"id": "$pageview", "type": "events", "order": 0, "properties": PROPERTIES_OF_ALL_TYPES,}]} + ) + self.assertEqual(should_query_elements_chain_column(filter), True) + + def test_should_query_element_chain_column_with_actions(self): + action = Action.objects.create(team=self.team) + step1 = ActionStep.objects.create( + event="$autocapture", action=action, url="https://example.com/donate", url_matching=ActionStep.EXACT, + ) + + filter = Filter(data={"actions": [{"id": action.id, "math": "dau"}]}) + self.assertEqual( + ColumnOptimizer(filter, self.team.id).should_query_elements_chain_column, False, + ) + + ActionStep.objects.create( + action=action, event="$autocapture", tag_name="button", text="Pay $10", + ) + + self.assertEqual( + ColumnOptimizer(filter, self.team.id).should_query_elements_chain_column, True, + ) + + filter = BASE_FILTER.with_data({"exclusions": [{"id": action.id, "type": "actions"}]}) + self.assertEqual( + ColumnOptimizer(filter, self.team.id).should_query_elements_chain_column, True, + ) + + def group_types_to_query(self): + group_types_to_query = lambda filter: ColumnOptimizer(filter, self.team.id).group_types_to_query + + self.assertEqual(group_types_to_query(BASE_FILTER), set()) + self.assertEqual(group_types_to_query(FILTER_WITH_PROPERTIES), {2}) diff --git 
a/ee/clickhouse/queries/test/test_event_query.py b/ee/clickhouse/queries/test/test_event_query.py new file mode 100644 index 0000000000000..a8d64e150a8e3 --- /dev/null +++ b/ee/clickhouse/queries/test/test_event_query.py @@ -0,0 +1,416 @@ +from uuid import uuid4 + +from freezegun import freeze_time + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.materialized_columns import materialize +from ee.clickhouse.models.event import create_event +from ee.clickhouse.models.group import create_group +from ee.clickhouse.queries.trends.trend_event_query import TrendsEventQuery +from ee.clickhouse.util import ClickhouseTestMixin, snapshot_clickhouse_queries +from posthog.models import Action, ActionStep +from posthog.models.cohort import Cohort +from posthog.models.element import Element +from posthog.models.entity import Entity +from posthog.models.filters import Filter +from posthog.models.group_type_mapping import GroupTypeMapping +from posthog.models.person import Person +from posthog.test.base import APIBaseTest + + +def _create_person(**kwargs): + person = Person.objects.create(**kwargs) + return Person(id=person.uuid, uuid=person.uuid) + + +def _create_event(**kwargs): + kwargs.update({"event_uuid": uuid4()}) + create_event(**kwargs) + + +def _create_cohort(**kwargs): + team = kwargs.pop("team") + name = kwargs.pop("name") + groups = kwargs.pop("groups") + is_static = kwargs.pop("is_static", False) + cohort = Cohort.objects.create(team=team, name=name, groups=groups, is_static=is_static) + return cohort + + +class TestEventQuery(ClickhouseTestMixin, APIBaseTest): + def setUp(self): + super().setUp() + self._create_sample_data() + + def _create_sample_data(self): + distinct_id = "user_one_{}".format(self.team.pk) + _create_person(distinct_ids=[distinct_id], team=self.team) + + _create_event(event="viewed", distinct_id=distinct_id, team=self.team, timestamp="2021-05-01 00:00:00") + + def _run_query(self, filter: Filter, entity=None): + entity = entity or filter.entities[0] + + query, params = TrendsEventQuery(filter=filter, entity=entity, team_id=self.team.pk).get_query() + + result = sync_execute(query, params) + + return result, query + + @snapshot_clickhouse_queries + def test_basic_event_filter(self): + self._run_query( + Filter( + data={ + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "events": [{"id": "viewed", "order": 0},], + } + ) + ) + + def test_person_properties_filter(self): + filter = Filter( + data={ + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "events": [{"id": "viewed", "order": 0},], + "properties": [ + {"key": "email", "value": "@posthog.com", "operator": "not_icontains", "type": "person"}, + {"key": "key", "value": "val"}, + ], + } + ) + + entity = Entity({"id": "viewed", "type": "events"}) + + self._run_query(filter, entity) + + entity = Entity( + { + "id": "viewed", + "type": "events", + "properties": [ + {"key": "email", "value": "@posthog.com", "operator": "not_icontains", "type": "person"}, + {"key": "key", "value": "val"}, + ], + } + ) + + filter = Filter( + data={"date_from": "2021-05-01 00:00:00", "date_to": "2021-05-07 00:00:00", "events": [entity.to_dict()],} + ) + + self._run_query(filter, entity) + + @snapshot_clickhouse_queries + def test_event_properties_filter(self): + filter = Filter( + data={ + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "events": [{"id": "viewed", "order": 0},], + "properties": [{"key": "some_key", "value": "test_val", "operator": 
"exact", "type": "event"}], + } + ) + + entity = Entity({"id": "viewed", "type": "events"}) + + self._run_query(filter, entity) + + filter = Filter( + data={ + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "events": [{"id": "viewed", "order": 0},], + } + ) + + entity = Entity( + { + "id": "viewed", + "type": "events", + "properties": [{"key": "some_key", "value": "test_val", "operator": "exact", "type": "event"}], + } + ) + + self._run_query(filter, entity) + + # just smoke test making sure query runs because no new functions are used here + @snapshot_clickhouse_queries + def test_cohort_filter(self): + cohort = _create_cohort(team=self.team, name="cohort1", groups=[{"properties": {"name": "test"}}]) + + filter = Filter( + data={ + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "events": [{"id": "viewed", "order": 0},], + "properties": [{"key": "id", "value": cohort.pk, "type": "cohort"}], + } + ) + + self._run_query(filter) + + # just smoke test making sure query runs because no new functions are used here + @snapshot_clickhouse_queries + def test_entity_filtered_by_cohort(self): + cohort = _create_cohort(team=self.team, name="cohort1", groups=[{"properties": {"name": "test"}}]) + + filter = Filter( + data={ + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "events": [ + { + "id": "$pageview", + "order": 0, + "properties": [{"key": "id", "type": "cohort", "value": cohort.pk}], + }, + ], + } + ) + + p1 = Person.objects.create(team_id=self.team.pk, distinct_ids=["p1"], properties={"name": "test"}) + _create_event(team=self.team, event="$pageview", distinct_id="p1", timestamp="2020-01-02T12:00:00Z") + + p2 = Person.objects.create(team_id=self.team.pk, distinct_ids=["p2"], properties={"name": "foo"}) + _create_event(team=self.team, event="$pageview", distinct_id="p2", timestamp="2020-01-02T12:01:00Z") + + self._run_query(filter) + + # smoke test make sure query is formatted and runs + @snapshot_clickhouse_queries + def test_static_cohort_filter(self): + cohort = _create_cohort(team=self.team, name="cohort1", groups=[], is_static=True) + + filter = Filter( + data={ + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "events": [{"id": "viewed", "order": 0},], + "properties": [{"key": "id", "value": cohort.pk, "type": "cohort"}], + }, + team=self.team, + ) + + self._run_query(filter) + + @snapshot_clickhouse_queries + @freeze_time("2021-01-21") + def test_account_filters(self): + person1 = Person.objects.create(team_id=self.team.pk, distinct_ids=["person_1"], properties={"name": "John"}) + person2 = Person.objects.create(team_id=self.team.pk, distinct_ids=["person_2"], properties={"name": "Jane"}) + + _create_event(event="event_name", team=self.team, distinct_id="person_1") + _create_event(event="event_name", team=self.team, distinct_id="person_2") + _create_event(event="event_name", team=self.team, distinct_id="person_2") + + cohort = Cohort.objects.create(team=self.team, name="cohort1", groups=[{"properties": {"name": "Jane"}}]) + cohort.calculate_people() + + self.team.test_account_filters = [{"key": "id", "value": cohort.pk, "type": "cohort"}] + self.team.save() + + filter = Filter( + data={"events": [{"id": "event_name", "order": 0},], "filter_test_accounts": True}, team=self.team + ) + + self._run_query(filter) + + def test_action_with_person_property_filter(self): + person1 = Person.objects.create(team_id=self.team.pk, distinct_ids=["person_1"], properties={"name": "John"}) + person2 = 
Person.objects.create(team_id=self.team.pk, distinct_ids=["person_2"], properties={"name": "Jane"}) + + _create_event(event="event_name", team=self.team, distinct_id="person_1") + _create_event(event="event_name", team=self.team, distinct_id="person_2") + _create_event(event="event_name", team=self.team, distinct_id="person_2") + + action = Action.objects.create(team=self.team, name="action1") + ActionStep.objects.create( + event="event_name", action=action, properties=[{"key": "name", "type": "person", "value": "John"}], + ) + + filter = Filter(data={"actions": [{"id": action.id, "type": "actions", "order": 0},]}) + + self._run_query(filter) + + @snapshot_clickhouse_queries + def test_denormalised_props(self): + filters = { + "events": [ + { + "id": "user signed up", + "type": "events", + "order": 0, + "properties": [{"key": "test_prop", "value": "hi"}], + }, + ], + "date_from": "2020-01-01", + "properties": [{"key": "test_prop", "value": "hi"}], + "date_to": "2020-01-14", + } + + materialize("events", "test_prop") + + p1 = Person.objects.create(team_id=self.team.pk, distinct_ids=["p1"], properties={"key": "value"}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-02T12:00:00Z", + properties={"test_prop": "hi"}, + ) + + p2 = Person.objects.create(team_id=self.team.pk, distinct_ids=["p2"], properties={"key_2": "value_2"}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p2", + timestamp="2020-01-02T12:00:00Z", + properties={"test_prop": "hi"}, + ) + + filter = Filter(data=filters) + _, query = self._run_query(filter) + self.assertIn("mat_test_prop", query) + + @snapshot_clickhouse_queries + @freeze_time("2021-01-21") + def test_element(self): + _create_event( + event="$autocapture", + team=self.team, + distinct_id="whatever", + properties={"attr": "some_other_val"}, + elements=[ + Element( + tag_name="a", + href="/a-url", + attr_class=["small"], + text="bla bla", + attributes={}, + nth_child=1, + nth_of_type=0, + ), + Element(tag_name="button", attr_class=["btn", "btn-primary"], nth_child=0, nth_of_type=0), + Element(tag_name="div", nth_child=0, nth_of_type=0), + Element(tag_name="label", nth_child=0, nth_of_type=0, attr_id="nested",), + ], + ) + _create_event( + event="$pageview", + team=self.team, + distinct_id="whatever", + properties={"attr": "some_val"}, + elements=[ + Element( + tag_name="a", + href="/a-url", + attr_class=["small"], + text="bla bla", + attributes={}, + nth_child=1, + nth_of_type=0, + ), + Element(tag_name="button", attr_class=["btn", "btn-secondary"], nth_child=0, nth_of_type=0), + Element(tag_name="div", nth_child=0, nth_of_type=0), + Element(tag_name="img", nth_child=0, nth_of_type=0, attr_id="nested",), + ], + ) + + filter = Filter( + data={ + "events": [{"id": "event_name", "order": 0},], + "properties": [{"key": "tag_name", "value": ["label"], "operator": "exact", "type": "element"}], + } + ) + + self._run_query(filter) + + self._run_query( + filter.with_data( + {"properties": [{"key": "tag_name", "value": [], "operator": "exact", "type": "element"}],} + ) + ) + + def _create_groups_test_data(self): + GroupTypeMapping.objects.create(team=self.team, group_type="organization", group_type_index=0) + GroupTypeMapping.objects.create(team=self.team, group_type="company", group_type_index=1) + + create_group(team_id=self.team.pk, group_type_index=0, group_key="org:5", properties={"industry": "finance"}) + create_group(team_id=self.team.pk, group_type_index=0, group_key="org:6", 
properties={"industry": "technology"}) + create_group(team_id=self.team.pk, group_type_index=1, group_key="company:1", properties={"another": "value"}) + + Person.objects.create(team_id=self.team.pk, distinct_ids=["p1"], properties={"$browser": "test"}) + Person.objects.create(team_id=self.team.pk, distinct_ids=["p2"], properties={"$browser": "foobar"}) + Person.objects.create(team_id=self.team.pk, distinct_ids=["p3"], properties={"$browser": "test"}) + + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-02T12:00:00Z", + properties={"$group_0": "org:5", "$group_1": "company:1"}, + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p2", + timestamp="2020-01-02T12:00:00Z", + properties={"$group_0": "org:6", "$group_1": "company:1"}, + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-02T12:00:00Z", + properties={"$group_0": "org:6"}, + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p3", + timestamp="2020-01-02T12:00:00Z", + properties={"$group_0": "org:5"}, + ) + + @snapshot_clickhouse_queries + def test_groups_filters(self): + self._create_groups_test_data() + + filter = Filter( + { + "date_from": "2020-01-01T00:00:00Z", + "date_to": "2020-01-12T00:00:00Z", + "events": [{"id": "$pageview", "type": "events", "order": 0}], + "properties": [ + {"key": "industry", "value": "finance", "type": "group", "group_type_index": 0}, + {"key": "another", "value": "value", "type": "group", "group_type_index": 1}, + ], + }, + team=self.team, + ) + + results, _ = self._run_query(filter) + self.assertEqual(len(results), 1) + + @snapshot_clickhouse_queries + def test_groups_filters_mixed(self): + self._create_groups_test_data() + + filter = Filter( + { + "date_from": "2020-01-01T00:00:00Z", + "date_to": "2020-01-12T00:00:00Z", + "events": [{"id": "$pageview", "type": "events", "order": 0}], + "properties": [ + {"key": "industry", "value": "finance", "type": "group", "group_type_index": 0}, + {"key": "$browser", "value": "test", "type": "person"}, + ], + }, + team=self.team, + ) + + results, _ = self._run_query(filter) + self.assertEqual(len(results), 2) diff --git a/ee/clickhouse/queries/test/test_funnel.py b/ee/clickhouse/queries/test/test_funnel.py deleted file mode 100644 index 8a9f40a8e3e12..0000000000000 --- a/ee/clickhouse/queries/test/test_funnel.py +++ /dev/null @@ -1,21 +0,0 @@ -from uuid import uuid4 - -from ee.clickhouse.models.event import create_event -from ee.clickhouse.queries.clickhouse_funnel import ClickhouseFunnel -from ee.clickhouse.util import ClickhouseTestMixin -from posthog.models.person import Person -from posthog.queries.test.test_funnel import funnel_test_factory - - -def _create_person(**kwargs): - person = Person.objects.create(**kwargs) - return Person(id=person.uuid, uuid=person.uuid) - - -def _create_event(**kwargs): - kwargs.update({"event_uuid": uuid4()}) - create_event(**kwargs) - - -class TestClickhouseFunnel(ClickhouseTestMixin, funnel_test_factory(ClickhouseFunnel, _create_event, _create_person)): # type: ignore - pass diff --git a/ee/clickhouse/queries/test/test_groups_join_query.py b/ee/clickhouse/queries/test/test_groups_join_query.py new file mode 100644 index 0000000000000..039c368f53293 --- /dev/null +++ b/ee/clickhouse/queries/test/test_groups_join_query.py @@ -0,0 +1,18 @@ +import pytest + +from ee.clickhouse.queries.groups_join_query import GroupsJoinQuery +from posthog.models.filters import Filter + + +def 
test_groups_join_query_blank(): + filter = Filter(data={"properties": []}) + + assert GroupsJoinQuery(filter, 2).get_join_query() == ("", {}) + + +def test_groups_join_query_filtering(snapshot): + filter = Filter( + data={"properties": [{"key": "industry", "value": "finance", "type": "group", "group_type_index": 0}]} + ) + + assert GroupsJoinQuery(filter, 2).get_join_query() == snapshot diff --git a/ee/clickhouse/queries/test/test_lifecycle.py b/ee/clickhouse/queries/test/test_lifecycle.py new file mode 100644 index 0000000000000..bea99748ba5b3 --- /dev/null +++ b/ee/clickhouse/queries/test/test_lifecycle.py @@ -0,0 +1,26 @@ +from uuid import uuid4 + +from ee.clickhouse.models.event import create_event +from ee.clickhouse.queries.trends.clickhouse_trends import ClickhouseTrends +from ee.clickhouse.util import ClickhouseTestMixin +from posthog.models.action import Action +from posthog.models.action_step import ActionStep +from posthog.models.person import Person +from posthog.queries.test.test_lifecycle import lifecycle_test_factory + + +def _create_action(**kwargs): + team = kwargs.pop("team") + name = kwargs.pop("name") + action = Action.objects.create(team=team, name=name) + ActionStep.objects.create(action=action, event=name) + return action + + +def _create_event(**kwargs): + kwargs.update({"event_uuid": uuid4()}) + create_event(**kwargs) + + +class TestClickhouseLifecycle(ClickhouseTestMixin, lifecycle_test_factory(ClickhouseTrends, _create_event, Person.objects.create, _create_action)): # type: ignore + pass diff --git a/ee/clickhouse/queries/test/test_paths.py b/ee/clickhouse/queries/test/test_paths.py index c48cb908c2c12..b159051eb0147 100644 --- a/ee/clickhouse/queries/test/test_paths.py +++ b/ee/clickhouse/queries/test/test_paths.py @@ -1,16 +1,2603 @@ +import json +from typing import Any, Dict, List, Tuple +from unittest.mock import MagicMock from uuid import uuid4 +from django.test import TestCase +from django.utils import timezone +from freezegun import freeze_time + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.materialized_columns.columns import materialize from ee.clickhouse.models.event import create_event -from ee.clickhouse.queries.clickhouse_paths import ClickhousePaths +from ee.clickhouse.queries import ClickhousePaths +from ee.clickhouse.queries.paths import ClickhousePathsPersons +from ee.clickhouse.queries.paths.path_event_query import PathEventQuery from ee.clickhouse.util import ClickhouseTestMixin +from posthog.constants import ( + FUNNEL_PATH_AFTER_STEP, + FUNNEL_PATH_BEFORE_STEP, + FUNNEL_PATH_BETWEEN_STEPS, + INSIGHT_FUNNELS, + PAGEVIEW_EVENT, + SCREEN_EVENT, +) +from posthog.models.filters import Filter, PathFilter from posthog.models.person import Person -from posthog.queries.test.test_paths import paths_test_factory +from posthog.models.team import Team +from posthog.queries.test.test_paths import MockEvent, paths_test_factory +from posthog.test.base import test_with_materialized_columns + + +def _create_event(**event): + return {**event} + + +ONE_MINUTE = 60_000 # 1 minute in milliseconds + + +def _create_all_events(all_events: List[Dict]): + parsed = "" + for event in all_events: + data: Dict[str, Any] = {"properties": {}, "timestamp": timezone.now().strftime("%Y-%m-%d %H:%M:%S.%f")} + data.update(event) + mocked_event = MockEvent(**data) + parsed += f""" + ('{str(uuid4())}', '{mocked_event.event}', '{json.dumps(mocked_event.properties)}', '{mocked_event.timestamp}', {mocked_event.team.pk}, '{mocked_event.distinct_id}', '', 
'{timezone.now().strftime("%Y-%m-%d %H:%M:%S.%f")}', now(), 0) + """ + + sync_execute( + f""" + INSERT INTO events (uuid, event, properties, timestamp, team_id, distinct_id, elements_chain, created_at, _timestamp, _offset) VALUES + {parsed} + """ + ) + + +class TestClickhousePaths(ClickhouseTestMixin, paths_test_factory(ClickhousePaths, _create_event, Person.objects.create, _create_all_events)): # type: ignore + + maxDiff = None + + def _get_people_at_path(self, filter, path_start=None, path_end=None, funnel_filter=None, path_dropoff=None): + person_filter = filter.with_data( + {"path_start_key": path_start, "path_end_key": path_end, "path_dropoff_key": path_dropoff} + ) + result = ClickhousePathsPersons(person_filter, self.team, funnel_filter)._exec_query() + return [row[0] for row in result] + + def test_denormalized_properties(self): + materialize("events", "$current_url") + materialize("events", "$screen_name") + + query = ClickhousePaths(team=self.team, filter=PathFilter(data={"path_type": PAGEVIEW_EVENT})).get_query() + self.assertNotIn("json", query.lower()) + + query = ClickhousePaths(team=self.team, filter=PathFilter(data={"path_type": SCREEN_EVENT})).get_query() + self.assertNotIn("json", query.lower()) + + self.test_current_url_paths_and_logic() + + def test_step_limit(self): + + p1 = Person.objects.create(team_id=self.team.pk, distinct_ids=["fake"]) + events = [ + _create_event( + properties={"$current_url": "/1"}, + distinct_id="fake", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:21:34", + ), + _create_event( + properties={"$current_url": "/2"}, + distinct_id="fake", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:22:34", + ), + _create_event( + properties={"$current_url": "/3"}, + distinct_id="fake", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:24:34", + ), + _create_event( + properties={"$current_url": "/4"}, + distinct_id="fake", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:27:34", + ), + ] + + _create_all_events(events) + + with freeze_time("2012-01-7T03:21:34.000Z"): + filter = PathFilter(data={"step_limit": 2}) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter) + + self.assertEqual( + response, [{"source": "1_/1", "target": "2_/2", "value": 1, "average_conversion_time": ONE_MINUTE}] + ) + self.assertEqual([p1.uuid], self._get_people_at_path(filter, "1_/1", "2_/2")) + self.assertEqual([], self._get_people_at_path(filter, "2_/2", "3_/3")) + + with freeze_time("2012-01-7T03:21:34.000Z"): + filter = PathFilter(data={"step_limit": 3}) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter) + + self.assertEqual( + response, + [ + {"source": "1_/1", "target": "2_/2", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "2_/2", "target": "3_/3", "value": 1, "average_conversion_time": 2 * ONE_MINUTE}, + ], + ) + self.assertEqual([p1.uuid], self._get_people_at_path(filter, "2_/2", "3_/3")) + + with freeze_time("2012-01-7T03:21:34.000Z"): + filter = PathFilter(data={"step_limit": 4}) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter) + + self.assertEqual( + response, + [ + {"source": "1_/1", "target": "2_/2", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "2_/2", "target": "3_/3", "value": 1, "average_conversion_time": 2 * ONE_MINUTE}, + {"source": "3_/3", "target": "4_/4", "value": 1, "average_conversion_time": 3 * ONE_MINUTE}, + ], + ) + 
self.assertEqual([p1.uuid], self._get_people_at_path(filter, "1_/1", "2_/2")) + self.assertEqual([p1.uuid], self._get_people_at_path(filter, "2_/2", "3_/3")) + self.assertEqual([p1.uuid], self._get_people_at_path(filter, "3_/3", "4_/4")) + + def test_step_conversion_times(self): + + Person.objects.create(team_id=self.team.pk, distinct_ids=["fake"]) + p1 = [ + _create_event( + properties={"$current_url": "/1"}, + distinct_id="fake", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:21:34", + ), + _create_event( + properties={"$current_url": "/2"}, + distinct_id="fake", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:22:34", + ), + _create_event( + properties={"$current_url": "/3"}, + distinct_id="fake", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:24:34", + ), + _create_event( + properties={"$current_url": "/4"}, + distinct_id="fake", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:27:34", + ), + ] + + Person.objects.create(team_id=self.team.pk, distinct_ids=["fake2"]) + p2 = [ + _create_event( + properties={"$current_url": "/1"}, + distinct_id="fake2", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:21:34", + ), + _create_event( + properties={"$current_url": "/2"}, + distinct_id="fake2", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:23:34", + ), + _create_event( + properties={"$current_url": "/3"}, + distinct_id="fake2", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:27:34", + ), + ] + + _create_all_events([*p1, *p2]) + + filter = PathFilter(data={"step_limit": 4, "date_from": "2012-01-01", "include_event_types": ["$pageview"]}) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter) + + self.assertEqual( + response, + [ + {"source": "1_/1", "target": "2_/2", "value": 2, "average_conversion_time": 1.5 * ONE_MINUTE}, + {"source": "2_/2", "target": "3_/3", "value": 2, "average_conversion_time": 3 * ONE_MINUTE}, + {"source": "3_/3", "target": "4_/4", "value": 1, "average_conversion_time": 3 * ONE_MINUTE}, + ], + ) + + # this tests to make sure that paths don't get scrambled when there are several similar variations + def test_path_event_ordering(self): + events = [] + for i in range(50): + Person.objects.create(distinct_ids=[f"user_{i}"], team=self.team) + person_events = [ + _create_event( + event="step one", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:00:00", + properties={}, + ), + _create_event( + event="step two", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:01:00", + properties={}, + ), + _create_event( + event="step three", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:02:00", + properties={}, + ), + ] + events.extend(person_events) + + if i % 2 == 0: + events.append( + _create_event( + event="step branch", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:03:00", + properties={}, + ) + ) + + _create_all_events(events) + + filter = PathFilter( + data={"date_from": "2021-05-01", "date_to": "2021-05-03", "include_event_types": ["custom_event"]} + ) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter) + self.assertEqual( + response, + [ + {"source": "1_step one", "target": "2_step two", "value": 50, "average_conversion_time": 60000.0}, + {"source": "2_step two", "target": "3_step three", "value": 50, "average_conversion_time": 60000.0}, + {"source": "3_step three", "target": 
"4_step branch", "value": 25, "average_conversion_time": 60000.0}, + ], + ) + + def _create_sample_data_multiple_dropoffs(self): + events = [] + for i in range(5): + Person.objects.create(distinct_ids=[f"user_{i}"], team=self.team) + full_funnel = [ + _create_event( + event="step one", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:00:00", + properties={}, + ), + _create_event( + event="between_step_1_a", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:01:00", + properties={}, + ), + _create_event( + event="between_step_1_b", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:02:00", + properties={}, + ), + _create_event( + event="between_step_1_c", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:03:00", + properties={}, + ), + _create_event( + event="step two", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:04:00", + properties={}, + ), + _create_event( + event="step three", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:05:00", + properties={}, + ), + ] + events.extend(full_funnel) + + for i in range(5, 15): + Person.objects.create(distinct_ids=[f"user_{i}"], team=self.team) + two_step_funnel = [ + _create_event( + event="step one", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:00:00", + properties={}, + ), + _create_event( + event="between_step_1_a", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:01:00", + properties={}, + ), + _create_event( + event="between_step_1_b", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:02:00", + properties={}, + ), + _create_event( + event="step two", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:03:00", + properties={}, + ), + _create_event( + event="between_step_2_a", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:04:20", + properties={}, + ), + _create_event( + event="between_step_2_b", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:05:40", + properties={}, + ), + ] + events.extend(two_step_funnel) + + for i in range(15, 35): + Person.objects.create(distinct_ids=[f"user_{i}"], team=self.team) + funnel_branching = [ + _create_event( + event="step one", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:00:00", + properties={}, + ), + _create_event( + event="step dropoff1", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:01:00", + properties={}, + ), + _create_event( + event="step dropoff2", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:02:00", + properties={}, + ), + ] + if i % 2 == 0: + funnel_branching.append( + _create_event( + event="step branch", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:03:00", + properties={}, + ) + ) + events.extend(funnel_branching) + + _create_all_events(events) + + def test_path_by_grouping(self): + self._create_sample_data_multiple_dropoffs() + data = { + "insight": INSIGHT_FUNNELS, + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "path_groupings": ["between_step_1_*", "between_step_2_*", "step drop*"], + } + path_filter = PathFilter(data=data) + response = ClickhousePaths(team=self.team, filter=path_filter).run() + self.assertCountEqual( + response, + [ + { + "source": "1_step one", + "target": "2_step drop*", + "value": 20, + "average_conversion_time": 2 * ONE_MINUTE, + }, + # when we group events for a 
single user, these effectively become duplicate events, and we choose the last event from + # a list of duplicate events. + { + "source": "1_step one", + "target": "2_between_step_1_*", + "value": 15, + "average_conversion_time": (5 * 3 + 10 * 2) + * ONE_MINUTE + / 15, # first 5 go till between_step_1_c, next 10 go till between_step_1_b + }, + { + "source": "2_between_step_1_*", + "target": "3_step two", + "value": 15, + "average_conversion_time": ONE_MINUTE, + }, + { + "source": "2_step drop*", + "target": "3_step branch", + "value": 10, + "average_conversion_time": ONE_MINUTE, + }, + { + "source": "3_step two", + "target": "4_between_step_2_*", + "value": 10, + "average_conversion_time": 160000, + }, + {"source": "3_step two", "target": "4_step three", "value": 5, "average_conversion_time": ONE_MINUTE,}, + ], + ) + + def test_path_by_grouping_replacement(self): + + Person.objects.create(distinct_ids=[f"user_1"], team=self.team) + Person.objects.create(distinct_ids=[f"user_2"], team=self.team) + Person.objects.create(distinct_ids=[f"user_3"], team=self.team) + events = [ + { + "event": "$pageview", + "distinct_id": f"user_1", + "team": self.team, + "timestamp": "2021-05-01 00:00:00", + "properties": {"$current_url": "test.com/step1"}, + }, + { + "event": "$pageview", + "distinct_id": f"user_1", + "team": self.team, + "timestamp": "2021-05-01 00:01:00", + "properties": {"$current_url": "test.com/step2"}, + }, + { + "event": "$pageview", + "distinct_id": f"user_1", + "team": self.team, + "timestamp": "2021-05-01 00:02:00", + "properties": {"$current_url": "test.com/step3?key=value1"}, + }, + { + "event": "$pageview", + "distinct_id": f"user_2", + "team": self.team, + "timestamp": "2021-05-01 00:00:00", + "properties": {"$current_url": "test.com/step1"}, + }, + { + "event": "$pageview", + "distinct_id": f"user_2", + "team": self.team, + "timestamp": "2021-05-01 00:01:00", + "properties": {"$current_url": "test.com/step2"}, + }, + { + "event": "$pageview", + "distinct_id": f"user_2", + "team": self.team, + "timestamp": "2021-05-01 00:02:00", + "properties": {"$current_url": "test.com/step3?key=value2"}, + }, + { + "event": "$pageview", + "distinct_id": f"user_3", + "team": self.team, + "timestamp": "2021-05-01 00:00:00", + "properties": {"$current_url": "test.com/step1"}, + }, + { + "event": "$pageview", + "distinct_id": f"user_3", + "team": self.team, + "timestamp": "2021-05-01 00:01:00", + "properties": {"$current_url": "test.com/step2"}, + }, + { + "event": "$pageview", + "distinct_id": f"user_3", + "team": self.team, + "timestamp": "2021-05-01 00:02:00", + "properties": {"$current_url": "test.com/step3?key=value3"}, + }, + ] + _create_all_events(events) + + self.team.path_cleaning_filters = [{"alias": "?", "regex": "\\?(.*)"}] + self.team.save() + + data = { + "insight": INSIGHT_FUNNELS, + "include_event_types": ["$pageview"], + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + } + path_filter = PathFilter(data=data) + response_no_flag = ClickhousePaths(team=self.team, filter=path_filter).run() + + self.assertNotEqual( + response_no_flag, + [ + { + "source": "1_test.com/step1", + "target": "2_test.com/step2", + "value": 3, + "average_conversion_time": 60000.0, + }, + { + "source": "2_test.com/step2", + "target": "3_test.com/step3?", + "value": 3, + "average_conversion_time": 60000.0, + }, + ], + ) + + data.update({"path_replacements": "true"}) + response = ClickhousePaths(team=self.team, filter=path_filter).run() + + self.assertEqual( + response, + [ + { + 
"source": "1_test.com/step1", + "target": "2_test.com/step2", + "value": 3, + "average_conversion_time": 60000.0, + }, + { + "source": "2_test.com/step2", + "target": "3_test.com/step3?", + "value": 3, + "average_conversion_time": 60000.0, + }, + ], + ) + + def test_path_by_grouping_replacement_multiple(self): + events = [ + { + "event": "$pageview", + "distinct_id": f"user_1", + "team": self.team, + "timestamp": "2021-05-01 00:00:00", + "properties": {"$current_url": "test.com/step1"}, + }, + { + "event": "$pageview", + "distinct_id": f"user_1", + "team": self.team, + "timestamp": "2021-05-01 00:01:00", + "properties": {"$current_url": "test.com/step2/5"}, + }, + { + "event": "$pageview", + "distinct_id": f"user_1", + "team": self.team, + "timestamp": "2021-05-01 00:02:00", + "properties": {"$current_url": "test.com/step2/5?key=value1"}, + }, + { + "event": "$pageview", + "distinct_id": f"user_2", + "team": self.team, + "timestamp": "2021-05-01 00:00:00", + "properties": {"$current_url": "test.com/step1"}, + }, + { + "event": "$pageview", + "distinct_id": f"user_2", + "team": self.team, + "timestamp": "2021-05-01 00:01:00", + "properties": {"$current_url": "test.com/step2/5"}, + }, + { + "event": "$pageview", + "distinct_id": f"user_2", + "team": self.team, + "timestamp": "2021-05-01 00:02:00", + "properties": {"$current_url": "test.com/step2/5?key=value2"}, + }, + { + "event": "$pageview", + "distinct_id": f"user_3", + "team": self.team, + "timestamp": "2021-05-01 00:00:00", + "properties": {"$current_url": "test.com/step1"}, + }, + { + "event": "$pageview", + "distinct_id": f"user_3", + "team": self.team, + "timestamp": "2021-05-01 00:01:00", + "properties": {"$current_url": "test.com/step2/5"}, + }, + { + "event": "$pageview", + "distinct_id": f"user_3", + "team": self.team, + "timestamp": "2021-05-01 00:02:00", + "properties": {"$current_url": "test.com/step2/5?key=value3"}, + }, + ] + _create_all_events(events) + + Person.objects.create(distinct_ids=[f"user_1"], team=self.team) + + Person.objects.create(distinct_ids=[f"user_2"], team=self.team) + + Person.objects.create(distinct_ids=[f"user_3"], team=self.team) + + correct_response = [ + { + "source": "1_test.com/step1", + "target": "2_test.com/step2/", + "value": 3, + "average_conversion_time": 60000.0, + }, + { + "source": "2_test.com/step2/", + "target": "3_test.com/step2/", + "value": 3, + "average_conversion_time": 60000.0, + }, + ] + + self.team.path_cleaning_filters = [ + {"alias": "?", "regex": "\\?(.*)"}, + {"alias": "/", "regex": "/\\d+(/|\\?)?"}, + ] + self.team.save() + + data = { + "insight": INSIGHT_FUNNELS, + "include_event_types": ["$pageview"], + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "path_replacements": True, + } + path_filter = PathFilter(data=data) + response = ClickhousePaths(team=self.team, filter=path_filter).run() + self.assertEqual( + response, correct_response, + ) + + self.team.path_cleaning_filters = [ + {"alias": "?", "regex": "\\?(.*)"}, + ] + self.team.save() + + data.update({"local_path_cleaning_filters": [{"alias": "/", "regex": "/\\d+(/|\\?)?"}]}) + path_filter = PathFilter(data=data) + response = ClickhousePaths(team=self.team, filter=path_filter).run() + self.assertEqual( + response, correct_response, + ) + + # overriding team filters + data.update( + { + "path_replacements": False, + "local_path_cleaning_filters": [ + {"alias": "?", "regex": "\\?(.*)"}, + {"alias": "/", "regex": "/\\d+(/|\\?)?"}, + ], + } + ) + path_filter = PathFilter(data=data) + response = 
ClickhousePaths(team=self.team, filter=path_filter).run() + self.assertEqual( + response, correct_response, + ) + + def test_path_by_funnel_after_dropoff(self): + self._create_sample_data_multiple_dropoffs() + data = { + "insight": INSIGHT_FUNNELS, + "funnel_paths": FUNNEL_PATH_AFTER_STEP, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_interval": 7, + "funnel_window_interval_unit": "day", + "funnel_step": -2, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + funnel_filter = Filter(data=data) + path_filter = PathFilter(data=data) + response = ClickhousePaths(team=self.team, filter=path_filter, funnel_filter=funnel_filter).run() + self.assertEqual( + response, + [ + {"source": "1_step one", "target": "2_step dropoff1", "value": 20, "average_conversion_time": 60000.0}, + { + "source": "2_step dropoff1", + "target": "3_step dropoff2", + "value": 20, + "average_conversion_time": 60000.0, + }, + { + "source": "3_step dropoff2", + "target": "4_step branch", + "value": 10, + "average_conversion_time": 60000.0, + }, + ], + ) + self.assertEqual(20, len(self._get_people_at_path(path_filter, "1_step one", "2_step dropoff1", funnel_filter))) + self.assertEqual( + 20, len(self._get_people_at_path(path_filter, "2_step dropoff1", "3_step dropoff2", funnel_filter)) + ) + self.assertEqual( + 10, len(self._get_people_at_path(path_filter, "3_step dropoff2", "4_step branch", funnel_filter)) + ) + self.assertEqual( + 0, len(self._get_people_at_path(path_filter, "4_step branch", "3_step dropoff2", funnel_filter)) + ) + + def test_path_by_funnel_after_step_respects_conversion_window(self): + # note events happen after 1 day + events = [] + for i in range(5): + Person.objects.create(distinct_ids=[f"user_{i}"], team=self.team) + events.extend( + [ + { + "event": "step one", + "distinct_id": f"user_{i}", + "team": self.team, + "timestamp": "2021-05-01 00:00:00", + "properties": {}, + }, + { + "event": "between_step_1_a", + "distinct_id": f"user_{i}", + "team": self.team, + "timestamp": "2021-05-02 00:00:00", + "properties": {}, + }, + { + "event": "between_step_1_b", + "distinct_id": f"user_{i}", + "team": self.team, + "timestamp": "2021-05-03 00:00:00", + "properties": {}, + }, + { + "event": "between_step_1_c", + "distinct_id": f"user_{i}", + "team": self.team, + "timestamp": "2021-05-04 00:00:00", + "properties": {}, + }, + { + "event": "step two", + "distinct_id": f"user_{i}", + "team": self.team, + "timestamp": "2021-05-05 00:00:00", + "properties": {}, + }, + { + "event": "step three", + "distinct_id": f"user_{i}", + "team": self.team, + "timestamp": "2021-05-06 00:00:00", + "properties": {}, + }, + ] + ) + for i in range(15, 35): + Person.objects.create(distinct_ids=[f"user_{i}"], team=self.team) + events.extend( + [ + { + "event": "step one", + "distinct_id": f"user_{i}", + "team": self.team, + "timestamp": "2021-05-01 00:00:00", + "properties": {}, + }, + { + "event": "step dropoff1", + "distinct_id": f"user_{i}", + "team": self.team, + "timestamp": "2021-05-02 00:00:00", + "properties": {}, + }, + { + "event": "step dropoff2", + "distinct_id": f"user_{i}", + "team": self.team, + "timestamp": "2021-05-03 00:00:00", + "properties": {}, + }, + ] + ) + if i % 2 == 0: + events.extend( + [ + { + "event": "step branch", + "distinct_id": f"user_{i}", + "team": self.team, + "timestamp": "2021-05-04 00:00:00", + "properties": {}, + } + ] + ) + + _create_all_events(events) + 
+ data = { + "insight": INSIGHT_FUNNELS, + "funnel_paths": FUNNEL_PATH_AFTER_STEP, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_interval": 7, + "funnel_window_interval_unit": "day", + "funnel_step": -2, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + funnel_filter = Filter(data=data) + path_filter = PathFilter(data=data) + response = ClickhousePaths(team=self.team, filter=path_filter, funnel_filter=funnel_filter).run() + self.assertEqual( + response, + [ + { + "source": "1_step one", + "target": "2_step dropoff1", + "value": 20, + "average_conversion_time": ONE_MINUTE * 60 * 24, + }, + { + "source": "2_step dropoff1", + "target": "3_step dropoff2", + "value": 20, + "average_conversion_time": ONE_MINUTE * 60 * 24, + }, + { + "source": "3_step dropoff2", + "target": "4_step branch", + "value": 10, + "average_conversion_time": ONE_MINUTE * 60 * 24, + }, + ], + ) + self.assertEqual(20, len(self._get_people_at_path(path_filter, "1_step one", "2_step dropoff1", funnel_filter))) + self.assertEqual( + 20, len(self._get_people_at_path(path_filter, "2_step dropoff1", "3_step dropoff2", funnel_filter)) + ) + self.assertEqual( + 10, len(self._get_people_at_path(path_filter, "3_step dropoff2", "4_step branch", funnel_filter)) + ) + self.assertEqual( + 0, len(self._get_people_at_path(path_filter, "4_step branch", "3_step dropoff2", funnel_filter)) + ) + + def test_path_by_funnel_after_step(self): + self._create_sample_data_multiple_dropoffs() + data = { + "insight": INSIGHT_FUNNELS, + "funnel_paths": FUNNEL_PATH_AFTER_STEP, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_interval": 7, + "funnel_window_interval_unit": "day", + "funnel_step": 2, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + funnel_filter = Filter(data=data) + path_filter = PathFilter(data=data) + response = ClickhousePaths(team=self.team, filter=path_filter, funnel_filter=funnel_filter).run() + self.assertEqual( + response, + [ + { + "source": "1_step two", + "target": "2_between_step_2_a", + "value": 10, + "average_conversion_time": 80000.0, + }, + { + "source": "2_between_step_2_a", + "target": "3_between_step_2_b", + "value": 10, + "average_conversion_time": 80000.0, + }, + {"source": "1_step two", "target": "2_step three", "value": 5, "average_conversion_time": 60000.0}, + ], + ) + + def test_path_by_funnel_after_step_limit(self): + self._create_sample_data_multiple_dropoffs() + events = [] + # add more than 100. 
Previously, the funnel limit at 100 was stopping all users from showing up + for i in range(100, 200): + Person.objects.create(distinct_ids=[f"user_{i}"], team=self.team) + person_events = [ + _create_event( + event="step one", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:00:00", + properties={}, + ), + _create_event( + event="between_step_1_a", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:01:00", + properties={}, + ), + _create_event( + event="between_step_1_b", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:02:00", + properties={}, + ), + _create_event( + event="between_step_1_c", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:03:00", + properties={}, + ), + _create_event( + event="step two", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:04:00", + properties={}, + ), + _create_event( + event="step three", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:05:00", + properties={}, + ), + ] + events.extend(person_events) + _create_all_events(events) + + data = { + "insight": INSIGHT_FUNNELS, + "funnel_paths": FUNNEL_PATH_AFTER_STEP, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_interval": 7, + "funnel_window_interval_unit": "day", + "funnel_step": 2, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + funnel_filter = Filter(data=data) + path_filter = PathFilter(data=data) + response = ClickhousePaths(team=self.team, filter=path_filter, funnel_filter=funnel_filter).run() + self.assertEqual( + response, + [ + {"source": "1_step two", "target": "2_step three", "value": 105, "average_conversion_time": 60000.0}, + { + "source": "1_step two", + "target": "2_between_step_2_a", + "value": 10, + "average_conversion_time": 80000.0, + }, + { + "source": "2_between_step_2_a", + "target": "3_between_step_2_b", + "value": 10, + "average_conversion_time": 80000.0, + }, + ], + ) + + def test_path_by_funnel_before_dropoff(self): + self._create_sample_data_multiple_dropoffs() + data = { + "insight": INSIGHT_FUNNELS, + "funnel_paths": FUNNEL_PATH_BEFORE_STEP, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_interval": 7, + "funnel_window_interval_unit": "day", + "funnel_step": -3, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + funnel_filter = Filter(data=data) + path_filter = PathFilter(data=data) + response = ClickhousePaths(team=self.team, filter=path_filter, funnel_filter=funnel_filter).run() + self.assertEqual( + response, + [ + { + "source": "1_step one", + "target": "2_between_step_1_a", + "value": 10, + "average_conversion_time": 60000.0, + }, + { + "source": "2_between_step_1_a", + "target": "3_between_step_1_b", + "value": 10, + "average_conversion_time": 60000.0, + }, + { + "source": "3_between_step_1_b", + "target": "4_step two", + "value": 10, + "average_conversion_time": 60000.0, + }, + { + "source": "4_step two", + "target": "5_between_step_2_a", + "value": 10, + "average_conversion_time": 80000.0, + }, + ], + ) + + def test_path_by_funnel_before_step(self): + self._create_sample_data_multiple_dropoffs() + data = { + "insight": INSIGHT_FUNNELS, + "funnel_paths": FUNNEL_PATH_BEFORE_STEP, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": 
"2021-05-07 00:00:00", + "funnel_window_interval": 7, + "funnel_window_interval_unit": "day", + "funnel_step": 2, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + funnel_filter = Filter(data=data) + path_filter = PathFilter(data=data) + response = ClickhousePaths(team=self.team, filter=path_filter, funnel_filter=funnel_filter).run() + self.assertEqual( + response, + [ + { + "source": "1_step one", + "target": "2_between_step_1_a", + "value": 15, + "average_conversion_time": 60000.0, + }, + { + "source": "2_between_step_1_a", + "target": "3_between_step_1_b", + "value": 15, + "average_conversion_time": 60000.0, + }, + { + "source": "3_between_step_1_b", + "target": "4_step two", + "value": 10, + "average_conversion_time": 60000.0, + }, + { + "source": "3_between_step_1_b", + "target": "4_between_step_1_c", + "value": 5, + "average_conversion_time": 60000.0, + }, + { + "source": "4_between_step_1_c", + "target": "5_step two", + "value": 5, + "average_conversion_time": 60000.0, + }, + ], + ) + + def test_path_by_funnel_between_step(self): + self._create_sample_data_multiple_dropoffs() + data = { + "insight": INSIGHT_FUNNELS, + "funnel_paths": FUNNEL_PATH_BETWEEN_STEPS, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_interval": 7, + "funnel_window_interval_unit": "day", + "funnel_step": 2, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + funnel_filter = Filter(data=data) + path_filter = PathFilter(data=data) + response = ClickhousePaths(team=self.team, filter=path_filter, funnel_filter=funnel_filter).run() + self.assertEqual( + response, + [ + { + "source": "1_step one", + "target": "2_between_step_1_a", + "value": 15, + "average_conversion_time": 60000.0, + }, + { + "source": "2_between_step_1_a", + "target": "3_between_step_1_b", + "value": 15, + "average_conversion_time": 60000.0, + }, + { + "source": "3_between_step_1_b", + "target": "4_step two", + "value": 10, + "average_conversion_time": 60000.0, + }, + { + "source": "3_between_step_1_b", + "target": "4_between_step_1_c", + "value": 5, + "average_conversion_time": 60000.0, + }, + { + "source": "4_between_step_1_c", + "target": "5_step two", + "value": 5, + "average_conversion_time": 60000.0, + }, + ], + ) + self.assertEqual( + 15, len(self._get_people_at_path(path_filter, "1_step one", "2_between_step_1_a", funnel_filter)) + ) + self.assertEqual( + 15, len(self._get_people_at_path(path_filter, "2_between_step_1_a", "3_between_step_1_b", funnel_filter)) + ) + self.assertEqual( + 10, len(self._get_people_at_path(path_filter, "3_between_step_1_b", "4_step two", funnel_filter)) + ) + self.assertEqual( + 5, len(self._get_people_at_path(path_filter, "3_between_step_1_b", "4_between_step_1_c", funnel_filter)) + ) + self.assertEqual( + 5, len(self._get_people_at_path(path_filter, "4_between_step_1_c", "5_step two", funnel_filter)) + ) + + @test_with_materialized_columns(["$current_url"]) + def test_paths_end(self): + Person.objects.create(team_id=self.team.pk, distinct_ids=["person_1"]) + p1 = [ + _create_event( + properties={"$current_url": "/1"}, + distinct_id="person_1", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:01:00", + ), + _create_event( + properties={"$current_url": "/2"}, + distinct_id="person_1", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:02:00", + ), + _create_event( 
+ properties={"$current_url": "/3"}, + distinct_id="person_1", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:03:00", + ), + _create_event( + properties={"$current_url": "/4"}, + distinct_id="person_1", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:04:00", + ), + _create_event( + properties={"$current_url": "/5"}, + distinct_id="person_1", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:05:00", + ), + _create_event( + properties={"$current_url": "/about"}, + distinct_id="person_1", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:06:00", + ), + _create_event( + properties={"$current_url": "/after"}, + distinct_id="person_1", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:07:00", + ), + ] + + Person.objects.create(team_id=self.team.pk, distinct_ids=["person_2"]) + p2 = [ + _create_event( + properties={"$current_url": "/5"}, + distinct_id="person_2", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:01:00", + ), + _create_event( + properties={"$current_url": "/about"}, + distinct_id="person_2", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:02:00", + ), + ] + + Person.objects.create(team_id=self.team.pk, distinct_ids=["person_3"]) + p3 = [ + _create_event( + properties={"$current_url": "/3"}, + distinct_id="person_3", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:01:00", + ), + _create_event( + properties={"$current_url": "/4"}, + distinct_id="person_3", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:02:00", + ), + _create_event( + properties={"$current_url": "/about"}, + distinct_id="person_3", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:03:00", + ), + _create_event( + properties={"$current_url": "/after"}, + distinct_id="person_3", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:04:00", + ), + ] + + events = [*p1, *p2, *p3] + _create_all_events(events) + + filter = PathFilter( + data={ + "path_type": "$pageview", + "end_point": "/about", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + } + ) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter,) + self.assertEqual( + response, + [ + {"source": "1_/2", "target": "2_/3", "value": 1, "average_conversion_time": 60000.0}, + {"source": "1_/3", "target": "2_/4", "value": 1, "average_conversion_time": 60000.0}, + {"source": "1_/5", "target": "2_/about", "value": 1, "average_conversion_time": 60000.0}, + {"source": "2_/3", "target": "3_/4", "value": 1, "average_conversion_time": 60000.0}, + {"source": "2_/4", "target": "3_/about", "value": 1, "average_conversion_time": 60000.0}, + {"source": "3_/4", "target": "4_/5", "value": 1, "average_conversion_time": 60000.0}, + {"source": "4_/5", "target": "5_/about", "value": 1, "average_conversion_time": 60000.0}, + ], + ) + + def test_event_inclusion_exclusion_filters(self): + + # P1 for pageview event + Person.objects.create(team_id=self.team.pk, distinct_ids=["p1"]) + p1 = [ + _create_event( + properties={"$current_url": "/1"}, + distinct_id="p1", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:21:34", + ), + _create_event( + properties={"$current_url": "/2/"}, + distinct_id="p1", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:22:34", + ), + _create_event( + properties={"$current_url": "/3"}, + distinct_id="p1", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:24:34", + ), + ] + + 
# P2 for screen event + Person.objects.create(team_id=self.team.pk, distinct_ids=["p2"]) + p2 = [ + _create_event( + properties={"$screen_name": "/screen1"}, + distinct_id="p2", + event="$screen", + team=self.team, + timestamp="2012-01-01 03:21:34", + ), + _create_event( + properties={"$screen_name": "/screen2"}, + distinct_id="p2", + event="$screen", + team=self.team, + timestamp="2012-01-01 03:22:34", + ), + _create_event( + properties={"$screen_name": "/screen3"}, + distinct_id="p2", + event="$screen", + team=self.team, + timestamp="2012-01-01 03:24:34", + ), + ] + + # P3 for custom event + Person.objects.create(team_id=self.team.pk, distinct_ids=["p3"]) + p3 = [ + _create_event(distinct_id="p3", event="/custom1", team=self.team, timestamp="2012-01-01 03:21:34",), + _create_event(distinct_id="p3", event="/custom2", team=self.team, timestamp="2012-01-01 03:22:34",), + _create_event(distinct_id="p3", event="/custom3", team=self.team, timestamp="2012-01-01 03:24:34",), + ] + + events = [*p1, *p2, *p3] + _create_all_events(events) + + filter = PathFilter(data={"step_limit": 4, "date_from": "2012-01-01", "include_event_types": ["$pageview"]}) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter) + + self.assertEqual( + response, + [ + {"source": "1_/1", "target": "2_/2", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "2_/2", "target": "3_/3", "value": 1, "average_conversion_time": 2 * ONE_MINUTE}, + ], + ) + + filter = filter.with_data({"include_event_types": ["$screen"]}) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter) + + self.assertEqual( + response, + [ + {"source": "1_/screen1", "target": "2_/screen2", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "2_/screen2", "target": "3_/screen3", "value": 1, "average_conversion_time": 2 * ONE_MINUTE}, + ], + ) + + filter = filter.with_data({"include_event_types": ["custom_event"]}) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter) + + self.assertEqual( + response, + [ + {"source": "1_/custom1", "target": "2_/custom2", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "2_/custom2", "target": "3_/custom3", "value": 1, "average_conversion_time": 2 * ONE_MINUTE}, + ], + ) + + filter = filter.with_data({"include_event_types": [], "include_custom_events": ["/custom1", "/custom2"]}) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter) + + self.assertEqual( + response, + [{"source": "1_/custom1", "target": "2_/custom2", "value": 1, "average_conversion_time": ONE_MINUTE},], + ) + + filter = filter.with_data({"include_event_types": [], "include_custom_events": ["/custom3", "blah"]}) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter) + + self.assertEqual( + response, [], + ) + + filter = filter.with_data( + {"include_event_types": ["$pageview", "$screen", "custom_event"], "include_custom_events": []} + ) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter) + + self.assertEqual( + response, + [ + {"source": "1_/1", "target": "2_/2", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "1_/custom1", "target": "2_/custom2", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "1_/screen1", "target": "2_/screen2", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "2_/2", "target": "3_/3", "value": 1, 
"average_conversion_time": 2 * ONE_MINUTE}, + {"source": "2_/custom2", "target": "3_/custom3", "value": 1, "average_conversion_time": 2 * ONE_MINUTE}, + {"source": "2_/screen2", "target": "3_/screen3", "value": 1, "average_conversion_time": 2 * ONE_MINUTE}, + ], + ) + + filter = filter.with_data( + { + "include_event_types": ["$pageview", "$screen", "custom_event"], + "include_custom_events": [], + "exclude_events": ["/custom1", "/1", "/2", "/3"], + } + ) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter) + self.assertEqual( + response, + [ + {"source": "1_/custom2", "target": "2_/custom3", "value": 1, "average_conversion_time": 2 * ONE_MINUTE}, + {"source": "1_/screen1", "target": "2_/screen2", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "2_/screen2", "target": "3_/screen3", "value": 1, "average_conversion_time": 2 * ONE_MINUTE}, + ], + ) + + def test_event_exclusion_filters_with_wildcards(self): + + # P1 for pageview event /2/bar/1/foo + Person.objects.create(team_id=self.team.pk, distinct_ids=["p1"]) + p1 = [ + _create_event( + properties={"$current_url": "/1"}, + distinct_id="p1", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:21:34", + ), + _create_event( + properties={"$current_url": "/2/bar/1/foo"}, # regex matches, despite beginning with `/2/` + distinct_id="p1", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:22:34", + ), + _create_event( + properties={"$current_url": "/3"}, + distinct_id="p1", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:24:34", + ), + ] + + # P2 for pageview event /bar/2/foo + Person.objects.create(team_id=self.team.pk, distinct_ids=["p2"]) + p2 = [ + _create_event( + properties={"$current_url": "/1"}, + distinct_id="p2", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:21:34", + ), + _create_event( + properties={"$current_url": "/bar/2/foo"}, + distinct_id="p2", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:22:34", + ), + _create_event( + properties={"$current_url": "/3"}, + distinct_id="p2", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:24:34", + ), + ] + + # P3 for pageview event /bar/3/foo + Person.objects.create(team_id=self.team.pk, distinct_ids=["p3"]) + p3 = [ + _create_event( + properties={"$current_url": "/1"}, + distinct_id="p3", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:21:34", + ), + _create_event( + properties={"$current_url": "/bar/33/foo"}, + distinct_id="p3", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:22:34", + ), + _create_event( + properties={"$current_url": "/3"}, + distinct_id="p3", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:24:34", + ), + ] + + events = [*p1, *p2, *p3] + _create_all_events(events) + + filter = PathFilter( + data={ + "step_limit": 4, + "date_from": "2012-01-01", + "exclude_events": ["/bar/*/foo"], + "include_event_types": ["$pageview"], + "path_groupings": ["/bar/*/foo"], + } + ) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter) + + self.assertEqual( + response, [{"source": "1_/1", "target": "2_/3", "value": 3, "average_conversion_time": 3 * ONE_MINUTE},], + ) + + filter = filter.with_data({"path_groupings": ["/xxx/invalid/*"]}) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter) + + self.assertEqual(len(response), 6) + + def test_event_inclusion_exclusion_filters_across_single_person(self): + 
+ # P1 for pageview event, screen event, and custom event all together + Person.objects.create(team_id=self.team.pk, distinct_ids=["p1"]) + events = [ + _create_event( + properties={"$current_url": "/1"}, + distinct_id="p1", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:21:34", + ), + _create_event( + properties={"$current_url": "/2"}, + distinct_id="p1", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:22:34", + ), + _create_event( + properties={"$current_url": "/3"}, + distinct_id="p1", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:24:34", + ), + _create_event( + properties={"$screen_name": "/screen1"}, + distinct_id="p1", + event="$screen", + team=self.team, + timestamp="2012-01-01 03:25:34", + ), + _create_event( + properties={"$screen_name": "/screen2"}, + distinct_id="p1", + event="$screen", + team=self.team, + timestamp="2012-01-01 03:26:34", + ), + _create_event( + properties={"$screen_name": "/screen3"}, + distinct_id="p1", + event="$screen", + team=self.team, + timestamp="2012-01-01 03:28:34", + ), + _create_event(distinct_id="p1", event="/custom1", team=self.team, timestamp="2012-01-01 03:29:34",), + _create_event(distinct_id="p1", event="/custom2", team=self.team, timestamp="2012-01-01 03:30:34",), + _create_event(distinct_id="p1", event="/custom3", team=self.team, timestamp="2012-01-01 03:32:34",), + ] + + _create_all_events(events) + + filter = PathFilter(data={"step_limit": 10, "date_from": "2012-01-01"}) # include everything, exclude nothing + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter) + + self.assertEqual( + response, + [ + {"source": "1_/1", "target": "2_/2", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "2_/2", "target": "3_/3", "value": 1, "average_conversion_time": 2 * ONE_MINUTE}, + {"source": "3_/3", "target": "4_/screen1", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "4_/screen1", "target": "5_/screen2", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "5_/screen2", "target": "6_/screen3", "value": 1, "average_conversion_time": 2 * ONE_MINUTE}, + {"source": "6_/screen3", "target": "7_/custom1", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "7_/custom1", "target": "8_/custom2", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "8_/custom2", "target": "9_/custom3", "value": 1, "average_conversion_time": 2 * ONE_MINUTE}, + ], + ) + + filter = filter.with_data({"include_event_types": ["$pageview", "$screen"]}) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter) + + self.assertEqual( + response, + [ + {"source": "1_/1", "target": "2_/2", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "2_/2", "target": "3_/3", "value": 1, "average_conversion_time": 2 * ONE_MINUTE}, + {"source": "3_/3", "target": "4_/screen1", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "4_/screen1", "target": "5_/screen2", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "5_/screen2", "target": "6_/screen3", "value": 1, "average_conversion_time": 2 * ONE_MINUTE}, + ], + ) + + filter = filter.with_data( + {"include_event_types": ["$pageview", "$screen"], "include_custom_events": ["/custom2"]} + ) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter) + + self.assertEqual( + response, + [ + {"source": "1_/1", "target": "2_/2", "value": 1, "average_conversion_time": ONE_MINUTE}, + 
{"source": "2_/2", "target": "3_/3", "value": 1, "average_conversion_time": 2 * ONE_MINUTE}, + {"source": "3_/3", "target": "4_/screen1", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "4_/screen1", "target": "5_/screen2", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "5_/screen2", "target": "6_/screen3", "value": 1, "average_conversion_time": 2 * ONE_MINUTE}, + {"source": "6_/screen3", "target": "7_/custom2", "value": 1, "average_conversion_time": 2 * ONE_MINUTE}, + ], + ) + + filter = filter.with_data( + { + "include_event_types": ["$pageview", "custom_event"], + "include_custom_events": [], + "exclude_events": ["/custom1", "/custom3"], + } + ) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter) + + self.assertEqual( + response, + [ + {"source": "1_/1", "target": "2_/2", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "2_/2", "target": "3_/3", "value": 1, "average_conversion_time": 2 * ONE_MINUTE}, + {"source": "3_/3", "target": "4_/custom2", "value": 1, "average_conversion_time": 6 * ONE_MINUTE}, + ], + ) + + def test_path_respect_session_limits(self): + Person.objects.create(team_id=self.team.pk, distinct_ids=["fake"]) + events = [ + _create_event( + properties={"$current_url": "/1"}, + distinct_id="fake", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:21:34", + ), + _create_event( + properties={"$current_url": "/2"}, + distinct_id="fake", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:22:34", + ), + _create_event( + properties={"$current_url": "/3"}, + distinct_id="fake", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:24:34", + ), + _create_event( + properties={"$current_url": "/1"}, + distinct_id="fake", + event="$pageview", + team=self.team, + timestamp="2012-01-02 03:21:54", # new day, new session + ), + _create_event( + properties={"$current_url": "/2/"}, + distinct_id="fake", + event="$pageview", + team=self.team, + timestamp="2012-01-02 03:22:54", + ), + _create_event( + properties={"$current_url": "/3"}, + distinct_id="fake", + event="$pageview", + team=self.team, + timestamp="2012-01-02 03:26:54", + ), + ] + + _create_all_events(events) + + filter = PathFilter(data={"date_from": "2012-01-01"}) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter) + + self.assertEqual( + response, + [ + {"source": "1_/1", "target": "2_/2", "value": 2, "average_conversion_time": ONE_MINUTE}, + {"source": "2_/2", "target": "3_/3", "value": 2, "average_conversion_time": 3 * ONE_MINUTE}, + ], + ) + + def test_path_removes_duplicates(self): + Person.objects.create(team_id=self.team.pk, distinct_ids=["fake"]) + p1 = [ + _create_event( + properties={"$current_url": "/1"}, + distinct_id="fake", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:21:34", + ), + _create_event( + properties={"$current_url": "/1"}, + distinct_id="fake", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:21:54", + ), + _create_event( + properties={"$current_url": "/2"}, + distinct_id="fake", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:22:34", + ), + _create_event( + properties={"$current_url": "/2/"}, # trailing slash should be removed + distinct_id="fake", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:22:54", + ), + _create_event( + properties={"$current_url": "/3"}, + distinct_id="fake", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:24:54", 
+ ), + ] + + Person.objects.create(team_id=self.team.pk, distinct_ids=["fake2"]) + p2 = [ + _create_event( + properties={"$current_url": "/1"}, + distinct_id="fake2", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:21:34", + ), + _create_event( + properties={"$current_url": "/2/"}, + distinct_id="fake2", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:23:34", + ), + _create_event( + properties={"$current_url": "/3"}, + distinct_id="fake2", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:27:34", + ), + ] + + _create_all_events([*p1, *p2]) + + filter = PathFilter(data={"date_from": "2012-01-01"}) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter) + + self.assertEqual( + response, + [ + {"source": "1_/1", "target": "2_/2", "value": 2, "average_conversion_time": 1.5 * ONE_MINUTE}, + {"source": "2_/2", "target": "3_/3", "value": 2, "average_conversion_time": 3 * ONE_MINUTE}, + ], + ) + + @test_with_materialized_columns(["$current_url", "$screen_name"]) + def test_paths_start_and_end(self): + p1 = Person.objects.create(team_id=self.team.pk, distinct_ids=["person_1"]) + events_p1 = [ + _create_event( + properties={"$current_url": "/1"}, + distinct_id="person_1", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:01:00", + ), + _create_event( + properties={"$current_url": "/2"}, + distinct_id="person_1", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:02:00", + ), + _create_event( + properties={"$current_url": "/3"}, + distinct_id="person_1", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:03:00", + ), + _create_event( + properties={"$current_url": "/4"}, + distinct_id="person_1", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:04:00", + ), + _create_event( + properties={"$current_url": "/5"}, + distinct_id="person_1", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:05:00", + ), + _create_event( + properties={"$current_url": "/about"}, + distinct_id="person_1", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:06:00", + ), + _create_event( + properties={"$current_url": "/after"}, + distinct_id="person_1", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:07:00", + ), + ] + + p2 = Person.objects.create(team_id=self.team.pk, distinct_ids=["person_2"]) + events_p2 = [ + _create_event( + properties={"$current_url": "/5"}, + distinct_id="person_2", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:01:00", + ), + _create_event( + properties={"$current_url": "/about"}, + distinct_id="person_2", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:02:00", + ), + ] + + p3 = Person.objects.create(team_id=self.team.pk, distinct_ids=["person_3"]) + events_p3 = [ + _create_event( + properties={"$current_url": "/3"}, + distinct_id="person_3", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:01:00", + ), + _create_event( + properties={"$current_url": "/4"}, + distinct_id="person_3", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:02:00", + ), + _create_event( + properties={"$current_url": "/about"}, + distinct_id="person_3", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:03:00", + ), + _create_event( + properties={"$current_url": "/after"}, + distinct_id="person_3", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:04:00", + ), + ] + + _create_all_events([*events_p1, *events_p2, *events_p3]) + + filter 
= PathFilter( + data={ + "path_type": "$pageview", + "start_point": "/5", + "end_point": "/about", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + }, + team=self.team, + ) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter,) + self.assertEqual( + response, [{"source": "1_/5", "target": "2_/about", "value": 2, "average_conversion_time": 60000.0}] + ) + self.assertCountEqual(self._get_people_at_path(filter, "1_/5", "2_/about"), [p1.uuid, p2.uuid]) + + # test aggregation for long paths + filter = filter.with_data({"start_point": "/2", "step_limit": 4}) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter,) + self.assertEqual( + response, + [ + {"source": "1_/2", "target": "2_/3", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "2_/3", "target": "3_...", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "3_...", "target": "4_/5", "value": 1, "average_conversion_time": ONE_MINUTE}, + {"source": "4_/5", "target": "5_/about", "value": 1, "average_conversion_time": ONE_MINUTE}, + ], + ) + self.assertCountEqual(self._get_people_at_path(filter, "3_...", "4_/5"), [p1.uuid]) + + def test_properties_queried_using_path_filter(self): + def should_query_list(filter) -> Tuple[bool, bool]: + path_query = PathEventQuery(filter, self.team.id) + return (path_query._should_query_url(), path_query._should_query_screen()) + + filter = PathFilter() + self.assertEqual(should_query_list(filter), (True, True)) + + filter = PathFilter({"include_event_types": ["$pageview"]}) + self.assertEqual(should_query_list(filter), (True, False)) + + filter = PathFilter({"include_event_types": ["$screen"]}) + self.assertEqual(should_query_list(filter), (False, True)) + + filter = filter.with_data({"include_event_types": [], "include_custom_events": ["/custom1", "/custom2"]}) + self.assertEqual(should_query_list(filter), (False, False)) + + filter = filter.with_data( + {"include_event_types": ["$pageview", "$screen", "custom_event"], "include_custom_events": []} + ) + self.assertEqual(should_query_list(filter), (True, True)) + + filter = filter.with_data( + { + "include_event_types": ["$pageview", "$screen", "custom_event"], + "include_custom_events": [], + "exclude_events": ["/custom1"], + } + ) + self.assertEqual(should_query_list(filter), (True, True)) + + filter = filter.with_data( + {"include_event_types": [], "include_custom_events": [], "exclude_events": ["$pageview"],} + ) + self.assertEqual(should_query_list(filter), (False, True)) + + def test_path_grouping_across_people(self): + + # P1 for pageview event /2/bar/1/foo + Person.objects.create(team_id=self.team.pk, distinct_ids=["p1"]) + p1 = [ + _create_event( + properties={"$current_url": "/1"}, + distinct_id="p1", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:21:34", + ), + _create_event( + properties={"$current_url": "/2/bar/1/foo"}, # regex matches, despite beginning with `/2/` + distinct_id="p1", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:22:34", + ), + _create_event( + properties={"$current_url": "/3"}, + distinct_id="p1", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:24:34", + ), + ] + + # P2 for pageview event /bar/2/foo + Person.objects.create(team_id=self.team.pk, distinct_ids=["p2"]) + p2 = [ + _create_event( + properties={"$current_url": "/1"}, + distinct_id="p2", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:21:34", + ), + _create_event( 
+ properties={"$current_url": "/bar/2/foo"}, + distinct_id="p2", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:22:34", + ), + _create_event( + properties={"$current_url": "/3"}, + distinct_id="p2", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:24:34", + ), + ] + + # P3 for pageview event /bar/3/foo + Person.objects.create(team_id=self.team.pk, distinct_ids=["p3"]) + p3 = [ + _create_event( + properties={"$current_url": "/1"}, + distinct_id="p3", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:21:34", + ), + _create_event( + properties={"$current_url": "/bar/33/foo"}, + distinct_id="p3", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:22:34", + ), + _create_event( + properties={"$current_url": "/3"}, + distinct_id="p3", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:24:34", + ), + ] + + _create_all_events([*p1, *p2, *p3]) + + filter = PathFilter( + data={ + "step_limit": 4, + "date_from": "2012-01-01", + "include_event_types": ["$pageview"], + "path_groupings": ["/bar/*/foo"], + } + ) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter) + + self.assertEqual( + response, + [ + {"source": "1_/1", "target": "2_/bar/*/foo", "value": 3, "average_conversion_time": ONE_MINUTE}, + {"source": "2_/bar/*/foo", "target": "3_/3", "value": 3, "average_conversion_time": 2 * ONE_MINUTE}, + ], + ) + + def test_path_grouping_with_evil_input(self): + + evil_string = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa!" + # P1 for pageview event /2/bar/1/foo + Person.objects.create(team_id=self.team.pk, distinct_ids=["p1"]) + p1 = [ + _create_event( + properties={"$current_url": evil_string}, + distinct_id="p1", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:21:34", + ), + _create_event( + properties={"$current_url": "/2/bar/aaa"}, + distinct_id="p1", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:22:34", + ), + _create_event( + properties={"$current_url": "/3"}, + distinct_id="p1", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:24:34", + ), + ] + + # P2 for pageview event /2/bar/2/foo + Person.objects.create(team_id=self.team.pk, distinct_ids=["p2"]) + p2 = [ + _create_event( + properties={"$current_url": "/1"}, + distinct_id="p2", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:21:34", + ), + _create_event( + properties={"$current_url": "/2/3?q=1"}, + distinct_id="p2", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:22:34", + ), + _create_event( + properties={"$current_url": "/3?q=1"}, + distinct_id="p2", + event="$pageview", + team=self.team, + timestamp="2012-01-01 03:24:34", + ), + ] + + _create_all_events([*p1, *p2]) + + filter = PathFilter( + data={ + "date_from": "2012-01-01", + "include_event_types": ["$pageview"], + "path_groupings": ["(a+)+", "[aaa|aaaa]+", "1.*", ".*", "/3?q=1", "/3*"], + } + ) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter) + self.assertEqual( + response, + [ + {"source": "1_/1", "target": "2_/3*", "value": 1, "average_conversion_time": 3 * ONE_MINUTE}, + { + "source": f"1_{evil_string}", + "target": "2_/2/bar/aaa", + "value": 1, + "average_conversion_time": ONE_MINUTE, + }, + {"source": "2_/2/bar/aaa", "target": "3_/3*", "value": 1, "average_conversion_time": 2 * ONE_MINUTE}, + ], + ) + + def test_paths_person_dropoffs(self): + events = [] + + # 5 people do 2 events + for i in range(5): + 
Person.objects.create(distinct_ids=[f"user_{i}"], team=self.team) + two_step = [ + _create_event( + event="step one", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:00:00", + properties={}, + ), + _create_event( + event="step two", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:04:00", + properties={}, + ), + ] + events.extend(two_step) + + # 10 people do 3 events + for i in range(5, 15): + Person.objects.create(distinct_ids=[f"user_{i}"], team=self.team) + three_step = [ + _create_event( + event="step one", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:00:00", + properties={}, + ), + _create_event( + event="step two", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:04:00", + properties={}, + ), + _create_event( + event="step three", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:05:00", + properties={}, + ), + ] + events.extend(three_step) + + # 20 people do 4 events + for i in range(15, 35): + Person.objects.create(distinct_ids=[f"user_{i}"], team=self.team) + four_step = [ + _create_event( + event="step one", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:00:00", + properties={}, + ), + _create_event( + event="step two", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:04:00", + properties={}, + ), + _create_event( + event="step three", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:05:00", + properties={}, + ), + _create_event( + event="step four", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:06:00", + properties={}, + ), + ] + events.extend(four_step) + + _create_all_events(events) + + filter = PathFilter( + data={ + "include_event_types": ["custom_event"], + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + } + ) + self.assertEqual(5, len(self._get_people_at_path(filter, path_dropoff="2_step two"))) # 5 dropoff at step 2 + self.assertEqual(35, len(self._get_people_at_path(filter, path_end="2_step two"))) # 35 total reach step 2 + self.assertEqual( + 30, len(self._get_people_at_path(filter, path_start="2_step two")) + ) # 30 total reach after step 2 + + self.assertEqual(10, len(self._get_people_at_path(filter, path_dropoff="3_step three"))) # 10 dropoff at step 3 + self.assertEqual(30, len(self._get_people_at_path(filter, path_end="3_step three"))) # 30 total reach step 3 + self.assertEqual( + 20, len(self._get_people_at_path(filter, path_start="3_step three")) + ) # 20 total reach after step 3 + + self.assertEqual(20, len(self._get_people_at_path(filter, path_dropoff="4_step four"))) # 20 dropoff at step 4 + self.assertEqual(20, len(self._get_people_at_path(filter, path_end="4_step four"))) # 20 total reach step 4 + self.assertEqual( + 0, len(self._get_people_at_path(filter, path_start="4_step four")) + ) # 0 total reach after step 4 + + def test_paths_start_dropping_orphaned_edges(self): + events = [] + for i in range(5): + # 5 people going through this route to increase weights + Person.objects.create(team_id=self.team.pk, distinct_ids=[f"person_{i}"]) + special_route = [ + _create_event( + properties={"$current_url": "/1"}, + distinct_id=f"person_{i}", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:01:00", + ), + _create_event( + properties={"$current_url": "/2"}, + distinct_id=f"person_{i}", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:02:00", + ), + _create_event( + properties={"$current_url": 
"/3"}, + distinct_id=f"person_{i}", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:03:00", + ), + _create_event( + properties={"$current_url": "/4"}, + distinct_id=f"person_{i}", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:04:00", + ), + _create_event( + properties={"$current_url": "/5"}, + distinct_id=f"person_{i}", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:05:00", + ), + _create_event( + properties={"$current_url": "/about"}, + distinct_id=f"person_{i}", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:06:00", + ), + _create_event( + properties={"$current_url": "/after"}, + distinct_id=f"person_{i}", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:07:00", + ), + ] + events.extend(special_route) + + p2 = Person.objects.create(team_id=self.team.pk, distinct_ids=["person_r_2"]) + events_p2 = [ + _create_event( + properties={"$current_url": "/2"}, + distinct_id="person_r_2", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:01:00", + ), + _create_event( + properties={"$current_url": "/a"}, + distinct_id="person_r_2", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:01:30", + ), + _create_event( + properties={"$current_url": "/x"}, + distinct_id="person_r_2", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:02:00", + ), + _create_event( + properties={"$current_url": "/about"}, + distinct_id="person_r_2", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:03:00", + ), + ] + events.extend(events_p2) + + p3 = Person.objects.create(team_id=self.team.pk, distinct_ids=["person_r_3"]) + event_p3 = [ + _create_event( + properties={"$current_url": "/2"}, + distinct_id="person_r_3", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:01:00", + ), + _create_event( + properties={"$current_url": "/b"}, + distinct_id="person_r_3", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:01:30", + ), + _create_event( + properties={"$current_url": "/x"}, + distinct_id="person_r_3", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:02:00", + ), + _create_event( + properties={"$current_url": "/about"}, + distinct_id="person_r_3", + event="$pageview", + team=self.team, + timestamp="2021-05-01 00:03:00", + ), + ] + + events.extend(event_p3) + _create_all_events(events) + + # /x -> /about has higher weight than /2 -> /a -> /x and /2 -> /b -> /x + + filter = PathFilter( + data={ + "path_type": "$pageview", + "start_point": "/2", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "edge_limit": "6", + } + ) + response = ClickhousePaths(team=self.team, filter=filter).run(team=self.team, filter=filter,) + self.assertEqual( + response, + [ + {"source": "1_/2", "target": "2_/3", "value": 5, "average_conversion_time": 60000.0}, + {"source": "2_/3", "target": "3_/4", "value": 5, "average_conversion_time": 60000.0}, + {"source": "3_/4", "target": "4_/5", "value": 5, "average_conversion_time": 60000.0}, + {"source": "4_/5", "target": "5_/about", "value": 5, "average_conversion_time": 60000.0}, + # {'source': '3_/x', 'target': '4_/about', 'value': 2, 'average_conversion_time': 60000.0}, # gets deleted by validation since dangling + {"source": "1_/2", "target": "2_/a", "value": 1, "average_conversion_time": 30000.0}, + ], + ) + + def test_path_min_edge_weight(self): + # original data from test_path_by_grouping.py + self._create_sample_data_multiple_dropoffs() + data = { + "insight": INSIGHT_FUNNELS, + 
"date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "min_edge_weight": 15, + "path_groupings": ["between_step_1_*", "between_step_2_*", "step drop*"], + } + path_filter = PathFilter(data=data) + response = ClickhousePaths(team=self.team, filter=path_filter).run() + self.assertCountEqual( + response, + [ + { + "source": "1_step one", + "target": "2_step drop*", + "value": 20, + "average_conversion_time": 2 * ONE_MINUTE, + }, + # when we group events for a single user, these effectively become duplicate events, and we choose the last event from + # a list of duplicate events. + { + "source": "1_step one", + "target": "2_between_step_1_*", + "value": 15, + "average_conversion_time": (5 * 3 + 10 * 2) + * ONE_MINUTE + / 15, # first 5 go till between_step_1_c, next 10 go till between_step_1_b + }, + { + "source": "2_between_step_1_*", + "target": "3_step two", + "value": 15, + "average_conversion_time": ONE_MINUTE, + }, + ], + ) + + path_filter = path_filter.with_data({"edge_limit": 2}) + response = ClickhousePaths(team=self.team, filter=path_filter).run() + self.assertCountEqual( + response, + [ + { + "source": "1_step one", + "target": "2_step drop*", + "value": 20, + "average_conversion_time": 2 * ONE_MINUTE, + }, + # when we group events for a single user, these effectively become duplicate events, and we choose the last event from + # a list of duplicate events. + { + "source": "1_step one", + "target": "2_between_step_1_*", + "value": 15, + "average_conversion_time": (5 * 3 + 10 * 2) + * ONE_MINUTE + / 15, # first 5 go till between_step_1_c, next 10 go till between_step_1_b + }, + ], + ) + + path_filter = path_filter.with_data({"edge_limit": 20, "max_edge_weight": 11, "min_edge_weight": 6}) + response = ClickhousePaths(team=self.team, filter=path_filter).run() + self.assertCountEqual( + response, + [ + { + "source": "2_step drop*", + "target": "3_step branch", + "value": 10, + "average_conversion_time": ONE_MINUTE, + }, + { + "source": "3_step two", + "target": "4_between_step_2_*", + "value": 10, + "average_conversion_time": 160000, + }, + ], + ) + + +class TestClickhousePathsEdgeValidation(TestCase): + + BASIC_PATH = [("1_a", "2_b"), ("2_b", "3_c"), ("3_c", "4_d")] # a->b->c->d + BASIC_PATH_2 = [("1_x", "2_y"), ("2_y", "3_z")] # x->y->z + + def test_basic_forest(self): + edges = self.BASIC_PATH + self.BASIC_PATH_2 + + results = ClickhousePaths(PathFilter(), MagicMock()).validate_results(edges) + + self.assertCountEqual(results, self.BASIC_PATH + self.BASIC_PATH_2) + + def test_basic_forest_with_dangling_edges(self): + edges = self.BASIC_PATH + self.BASIC_PATH_2 + [("2_w", "3_z"), ("3_x", "4_d"), ("2_xxx", "3_yyy")] + + results = ClickhousePaths(PathFilter(), MagicMock()).validate_results(edges) + + self.assertCountEqual(results, self.BASIC_PATH + self.BASIC_PATH_2) + + def test_basic_forest_with_dangling_and_cross_edges(self): + edges = self.BASIC_PATH + self.BASIC_PATH_2 + [("2_w", "3_z"), ("3_x", "4_d"), ("2_y", "3_c")] + + results = ClickhousePaths(PathFilter(), MagicMock()).validate_results(edges) + self.assertCountEqual(results, self.BASIC_PATH + self.BASIC_PATH_2 + [("2_y", "3_c")]) -def _create_event(**kwargs): - kwargs.update({"event_uuid": uuid4()}) - create_event(**kwargs) + def test_no_start_point(self): + edges = set(self.BASIC_PATH + self.BASIC_PATH_2 + [("2_w", "3_z"), ("3_x", "4_d")]) + edges.remove(("1_a", "2_b")) # remove first start point + edges = list(edges) # type: ignore + results = ClickhousePaths(PathFilter(), 
MagicMock()).validate_results(edges) -class TestClickhousePaths(ClickhouseTestMixin, paths_test_factory(ClickhousePaths, _create_event, Person.objects.create)): # type: ignore - pass + self.assertCountEqual(results, self.BASIC_PATH_2) diff --git a/ee/clickhouse/queries/test/test_person_query.py b/ee/clickhouse/queries/test/test_person_query.py new file mode 100644 index 0000000000000..3ea73f5c77505 --- /dev/null +++ b/ee/clickhouse/queries/test/test_person_query.py @@ -0,0 +1,99 @@ +import pytest + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.materialized_columns import materialize +from ee.clickhouse.queries.person_query import ClickhousePersonQuery +from posthog.models.filters import Filter +from posthog.models.person import Person +from posthog.models.team import Team + + +def _create_person(**kwargs): + person = Person.objects.create(**kwargs) + return Person(id=person.uuid, uuid=person.uuid) + + +def person_query(team: Team, filter: Filter, **kwargs): + return ClickhousePersonQuery(filter, team.pk, **kwargs).get_query()[0] + + +def run_query(team: Team, filter: Filter, **kwargs): + query, params = ClickhousePersonQuery(filter, team.pk, **kwargs).get_query() + rows = sync_execute(query, {**params, "team_id": team.pk}) + + if len(rows) > 0: + return {"rows": len(rows), "columns": len(rows[0])} + else: + return {"rows": 0} + + +@pytest.fixture +def testdata(db, team): + materialize("person", "email") + _create_person(distinct_ids=["1"], team_id=team.pk, properties={"email": "tim@posthog.com"}) + _create_person(distinct_ids=["2"], team_id=team.pk, properties={"email": "marius@posthog.com"}) + _create_person(distinct_ids=["3"], team_id=team.pk, properties={"email": "karl@example.com"}) + + +def test_person_query(testdata, team, snapshot): + filter = Filter(data={"properties": []}) + + assert person_query(team, filter) == snapshot + assert run_query(team, filter) == {"rows": 3, "columns": 1} + + filter = Filter( + data={ + "properties": [ + {"key": "event_prop", "value": "value"}, + {"key": "email", "type": "person", "value": "posthog", "operator": "icontains"}, + ], + } + ) + + assert person_query(team, filter) == snapshot + assert run_query(team, filter) == {"rows": 2, "columns": 1} + + +def test_person_query_with_extra_requested_fields(testdata, team, snapshot): + filter = Filter( + data={ + "properties": [{"key": "email", "type": "person", "value": "posthog", "operator": "icontains"},], + "breakdown": "person_prop_4326", + "breakdown_type": "person", + }, + ) + + assert person_query(team, filter) == snapshot + assert run_query(team, filter) == {"rows": 2, "columns": 2} + + filter = filter.with_data({"breakdown": "email", "breakdown_type": "person"}) + assert person_query(team, filter) == snapshot + assert run_query(team, filter) == {"rows": 2, "columns": 2} + + +def test_person_query_with_entity_filters(testdata, team, snapshot): + filter = Filter( + data={ + "events": [ + { + "id": "$pageview", + "properties": [{"key": "email", "type": "person", "value": "karl", "operator": "icontains"}], + } + ] + } + ) + + assert person_query(team, filter) == snapshot + assert run_query(team, filter) == {"rows": 3, "columns": 2} + + assert person_query(team, filter, entity=filter.entities[0]) == snapshot + assert run_query(team, filter, entity=filter.entities[0]) == {"rows": 1, "columns": 1} + + +def test_person_query_with_extra_fields(testdata, team, snapshot): + filter = Filter( + data={"properties": [{"key": "email", "type": "person", "value": "posthog", "operator": 
"icontains"},]}, + ) + + assert person_query(team, filter, extra_fields=["person_props", "pmat_email"]) == snapshot + assert run_query(team, filter, extra_fields=["person_props", "pmat_email"]) == {"rows": 2, "columns": 3} diff --git a/ee/clickhouse/queries/test/test_retention.py b/ee/clickhouse/queries/test/test_retention.py index e60f5453d4c66..559e2718cb558 100644 --- a/ee/clickhouse/queries/test/test_retention.py +++ b/ee/clickhouse/queries/test/test_retention.py @@ -4,11 +4,14 @@ import pytz from ee.clickhouse.models.event import create_event +from ee.clickhouse.models.group import create_group from ee.clickhouse.queries.clickhouse_retention import ClickhouseRetention -from ee.clickhouse.util import ClickhouseTestMixin +from ee.clickhouse.util import ClickhouseTestMixin, snapshot_clickhouse_queries from posthog.models.action import Action from posthog.models.action_step import ActionStep -from posthog.models.filter import Filter +from posthog.models.filters import Filter +from posthog.models.filters.retention_filter import RetentionFilter +from posthog.models.group_type_mapping import GroupTypeMapping from posthog.models.person import Person from posthog.queries.test.test_retention import retention_test_factory @@ -26,5 +29,118 @@ def _create_action(**kwargs): return action -class TestClickhouseRetention(ClickhouseTestMixin, retention_test_factory(ClickhouseRetention, _create_event, Person.objects.create, _create_action)): # type: ignore - pass +def _create_person(**kwargs): + person = Person.objects.create(**kwargs) + return person + + +class TestClickhouseRetention(ClickhouseTestMixin, retention_test_factory(ClickhouseRetention, _create_event, _create_person, _create_action)): # type: ignore + def _create_groups_and_events(self): + GroupTypeMapping.objects.create(team=self.team, group_type="organization", group_type_index=0) + GroupTypeMapping.objects.create(team=self.team, group_type="company", group_type_index=1) + + create_group(team_id=self.team.pk, group_type_index=0, group_key="org:5", properties={"industry": "finance"}) + create_group(team_id=self.team.pk, group_type_index=0, group_key="org:6", properties={"industry": "technology"}) + + create_group(team_id=self.team.pk, group_type_index=1, group_key="company:1", properties={}) + create_group(team_id=self.team.pk, group_type_index=1, group_key="company:2", properties={}) + + Person.objects.create(team=self.team, distinct_ids=["person1", "alias1"]) + Person.objects.create(team=self.team, distinct_ids=["person2"]) + Person.objects.create(team=self.team, distinct_ids=["person3"]) + + self._create_events( + [ + ("person1", self._date(0), {"$group_0": "org:5", "$group_1": "company:1"}), + ("person2", self._date(0), {"$group_0": "org:6"}), + ("person3", self._date(0)), + ("person1", self._date(1), {"$group_0": "org:5"}), + ("person2", self._date(1), {"$group_0": "org:6"}), + ("person1", self._date(7), {"$group_0": "org:5"}), + ("person2", self._date(7), {"$group_0": "org:6"}), + ("person1", self._date(14), {"$group_0": "org:5"}), + ("person1", self._date(month=1, day=-6), {"$group_0": "org:5", "$group_1": "company:1"}), + ("person2", self._date(month=1, day=-6), {"$group_0": "org:6"}), + ("person2", self._date(month=1, day=1), {"$group_0": "org:6"}), + ("person1", self._date(month=1, day=1), {"$group_0": "org:5"}), + ("person2", self._date(month=1, day=15), {"$group_0": "org:6", "$group_1": "company:1"}), + ] + ) + + @snapshot_clickhouse_queries + def test_groups_filtering(self): + self._create_groups_and_events() + + result = 
ClickhouseRetention().run( + RetentionFilter( + data={ + "date_to": self._date(10, month=1, hour=0), + "period": "Week", + "total_intervals": 7, + "properties": [{"key": "industry", "value": "technology", "type": "group", "group_type_index": 0}], + }, + team=self.team, + ), + self.team, + ) + + self.assertEqual( + self.pluck(result, "values", "count"), + [[1, 1, 0, 1, 1, 0, 1], [1, 0, 1, 1, 0, 1], [0, 0, 0, 0, 0], [1, 1, 0, 1], [1, 0, 1], [0, 0], [1],], + ) + + result = ClickhouseRetention().run( + RetentionFilter( + data={ + "date_to": self._date(10, month=1, hour=0), + "period": "Week", + "total_intervals": 7, + "properties": [ + {"key": "industry", "value": "", "type": "group", "group_type_index": 0, "operator": "is_set"} + ], + }, + team=self.team, + ), + self.team, + ) + + self.assertEqual( + self.pluck(result, "values", "count"), + [[2, 2, 1, 2, 2, 0, 1], [2, 1, 2, 2, 0, 1], [1, 1, 1, 0, 0], [2, 2, 0, 1], [2, 0, 1], [0, 0], [1],], + ) + + @snapshot_clickhouse_queries + def test_groups_aggregating(self): + self._create_groups_and_events() + + filter = RetentionFilter( + data={ + "date_to": self._date(10, month=1, hour=0), + "period": "Week", + "total_intervals": 7, + "aggregation_group_type_index": 0, + }, + team=self.team, + ) + + result = ClickhouseRetention().run(filter, self.team) + self.assertEqual( + self.pluck(result, "values", "count"), + [[2, 2, 1, 2, 2, 0, 1], [2, 1, 2, 2, 0, 1], [1, 1, 1, 0, 0], [2, 2, 0, 1], [2, 0, 1], [0, 0], [1],], + ) + + filter = RetentionFilter( + data={ + "date_to": self._date(10, month=1, hour=0), + "period": "Week", + "total_intervals": 7, + "aggregation_group_type_index": 1, + }, + team=self.team, + ) + + result = ClickhouseRetention().run(filter, self.team) + self.assertEqual( + self.pluck(result, "values", "count"), + [[1, 0, 0, 1, 0, 0, 1], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [1, 0, 0, 1], [0, 0, 0], [0, 0], [1],], + ) diff --git a/ee/clickhouse/queries/test/test_session_recording.py b/ee/clickhouse/queries/test/test_session_recording.py index 6188f688a8699..80cfe4ff97d4b 100644 --- a/ee/clickhouse/queries/test/test_session_recording.py +++ b/ee/clickhouse/queries/test/test_session_recording.py @@ -1,9 +1,9 @@ from uuid import uuid4 from ee.clickhouse.models.session_recording_event import create_session_recording_event -from ee.clickhouse.queries.clickhouse_session_recording import SessionRecording, add_session_recording_ids +from ee.clickhouse.queries.clickhouse_session_recording import SessionRecording, join_with_session_recordings from ee.clickhouse.util import ClickhouseTestMixin -from posthog.queries.test.test_session_recording import session_recording_test_factory +from posthog.queries.sessions.test.test_sessions_recording import session_recording_test_factory def _create_event(**kwargs): @@ -13,6 +13,6 @@ def _create_event(**kwargs): class TestClickhouseSessionRecording( - ClickhouseTestMixin, session_recording_test_factory(SessionRecording, add_session_recording_ids, _create_event) # type: ignore + ClickhouseTestMixin, session_recording_test_factory(SessionRecording, join_with_session_recordings, _create_event) # type: ignore ): pass diff --git a/ee/clickhouse/queries/test/test_sessions.py b/ee/clickhouse/queries/test/test_sessions.py index 01c62dd0dac3b..46d1fab928717 100644 --- a/ee/clickhouse/queries/test/test_sessions.py +++ b/ee/clickhouse/queries/test/test_sessions.py @@ -1,15 +1,28 @@ from uuid import uuid4 from ee.clickhouse.models.event import create_event +from ee.clickhouse.models.session_recording_event import 
create_session_recording_event from ee.clickhouse.queries.sessions.clickhouse_sessions import ClickhouseSessions +from ee.clickhouse.queries.sessions.list import ClickhouseSessionsList from ee.clickhouse.util import ClickhouseTestMixin -from posthog.queries.test.test_sessions import sessions_test_factory +from posthog.models.person import Person +from posthog.queries.sessions.test.test_sessions import sessions_test_factory +from posthog.queries.sessions.test.test_sessions_list import sessions_list_test_factory def _create_event(**kwargs): - kwargs.update({"event_uuid": uuid4()}) - create_event(**kwargs) + create_event(event_uuid=uuid4(), **kwargs) -class TestClickhouseSessions(ClickhouseTestMixin, sessions_test_factory(ClickhouseSessions, _create_event)): # type: ignore +def _create_session_recording_event(**kwargs): + create_session_recording_event( + uuid=uuid4(), **kwargs, + ) + + +class TestClickhouseSessions(ClickhouseTestMixin, sessions_test_factory(ClickhouseSessions, _create_event, Person.objects.create)): # type: ignore + pass + + +class TestClickhouseSessionsList(ClickhouseTestMixin, sessions_list_test_factory(ClickhouseSessionsList, _create_event, _create_session_recording_event)): # type: ignore pass diff --git a/ee/clickhouse/queries/test/test_stickiness.py b/ee/clickhouse/queries/test/test_stickiness.py index 62bdf76b98a9e..78a83e0591e97 100644 --- a/ee/clickhouse/queries/test/test_stickiness.py +++ b/ee/clickhouse/queries/test/test_stickiness.py @@ -2,6 +2,7 @@ from ee.clickhouse.models.event import create_event from ee.clickhouse.queries.clickhouse_stickiness import ClickhouseStickiness +from ee.clickhouse.queries.util import get_earliest_timestamp from ee.clickhouse.util import ClickhouseTestMixin from posthog.models.action import Action from posthog.models.action_step import ActionStep @@ -23,5 +24,10 @@ def _create_event(**kwargs): create_event(**kwargs) -class TestClickhouseStickiness(ClickhouseTestMixin, stickiness_test_factory(ClickhouseStickiness, _create_event, Person.objects.create, _create_action)): # type: ignore +def _create_person(**kwargs): + person = Person.objects.create(**kwargs) + return Person(id=person.uuid) + + +class TestClickhouseStickiness(ClickhouseTestMixin, stickiness_test_factory(ClickhouseStickiness, _create_event, _create_person, _create_action, get_earliest_timestamp)): # type: ignore pass diff --git a/ee/clickhouse/queries/test/test_trends.py b/ee/clickhouse/queries/test/test_trends.py index a2bd14a5d42b6..cb7f4b377ff0e 100644 --- a/ee/clickhouse/queries/test/test_trends.py +++ b/ee/clickhouse/queries/test/test_trends.py @@ -1,23 +1,32 @@ from uuid import uuid4 +from django.utils import timezone from freezegun import freeze_time +from rest_framework.exceptions import ValidationError from ee.clickhouse.models.event import create_event +from ee.clickhouse.models.group import create_group +from ee.clickhouse.models.person import create_person_distinct_id from ee.clickhouse.queries.trends.clickhouse_trends import ClickhouseTrends -from ee.clickhouse.util import ClickhouseTestMixin +from ee.clickhouse.queries.trends.person import TrendsPersonQuery +from ee.clickhouse.util import ClickhouseTestMixin, snapshot_clickhouse_queries +from posthog.constants import TRENDS_BAR_VALUE from posthog.models.action import Action from posthog.models.action_step import ActionStep from posthog.models.cohort import Cohort -from posthog.models.filter import Filter +from posthog.models.filters import Filter +from posthog.models.group_type_mapping import 
GroupTypeMapping from posthog.models.person import Person from posthog.queries.test.test_trends import trend_test_factory +from posthog.test.base import test_with_materialized_columns def _create_action(**kwargs): team = kwargs.pop("team") name = kwargs.pop("name") + properties = kwargs.pop("properties", {}) action = Action.objects.create(team=team, name=name) - ActionStep.objects.create(action=action, event=name) + ActionStep.objects.create(action=action, event=name, properties=properties) return action @@ -25,7 +34,7 @@ def _create_cohort(**kwargs): team = kwargs.pop("team") name = kwargs.pop("name") groups = kwargs.pop("groups") - cohort = Cohort.objects.create(team=team, name=name, groups=groups) + cohort = Cohort.objects.create(team=team, name=name, groups=groups, last_calculation=timezone.now()) return cohort @@ -34,9 +43,197 @@ def _create_event(**kwargs): create_event(**kwargs) +# override tests from the test factory if intervals are different class TestClickhouseTrends(ClickhouseTestMixin, trend_test_factory(ClickhouseTrends, _create_event, Person.objects.create, _create_action, _create_cohort)): # type: ignore - def test_breakdown_by_person_property(self): - person1, person2, person3, person4 = self._create_multiple_people() + + maxDiff = None + + def _get_trend_people(self, filter, entity): + result = TrendsPersonQuery(filter=filter, entity=entity, team=self.team).get_people() + return result + + def _create_groups(self): + GroupTypeMapping.objects.create(team=self.team, group_type="organization", group_type_index=0) + GroupTypeMapping.objects.create(team=self.team, group_type="company", group_type_index=1) + + create_group(team_id=self.team.pk, group_type_index=0, group_key="org:5", properties={"industry": "finance"}) + create_group(team_id=self.team.pk, group_type_index=0, group_key="org:6", properties={"industry": "technology"}) + create_group(team_id=self.team.pk, group_type_index=0, group_key="org:7", properties={"industry": "finance"}) + create_group( + team_id=self.team.pk, group_type_index=1, group_key="company:10", properties={"industry": "finance"} + ) + + @test_with_materialized_columns(["key"]) + def test_breakdown_with_filter(self): + Person.objects.create(team_id=self.team.pk, distinct_ids=["person1"], properties={"email": "test@posthog.com"}) + Person.objects.create(team_id=self.team.pk, distinct_ids=["person2"], properties={"email": "test@gmail.com"}) + _create_event(event="sign up", distinct_id="person1", team=self.team, properties={"key": "val"}) + _create_event(event="sign up", distinct_id="person2", team=self.team, properties={"key": "oh"}) + response = ClickhouseTrends().run( + Filter( + data={ + "date_from": "-14d", + "breakdown": "key", + "events": [{"id": "sign up", "name": "sign up", "type": "events", "order": 0,}], + "properties": [{"key": "key", "value": "oh", "operator": "not_icontains"}], + } + ), + self.team, + ) + self.assertEqual(len(response), 1) + # don't return the "none" option when it is empty + self.assertEqual(response[0]["breakdown_value"], "val") + + @snapshot_clickhouse_queries + def test_breakdown_with_filter_groups(self): + self._create_groups() + + _create_event( + event="sign up", + distinct_id="person1", + team=self.team, + properties={"key": "uh", "$group_0": "org:5"}, + timestamp="2020-01-02T12:00:00Z", + ) + _create_event( + event="sign up", + distinct_id="person1", + team=self.team, + properties={"key": "uh", "$group_0": "org:6"}, + timestamp="2020-01-02T12:00:00Z", + ) + _create_event( + event="sign up", + distinct_id="person1", +
team=self.team, + properties={"key": "oh", "$group_0": "org:7", "$group_1": "company:10"}, + timestamp="2020-01-02T12:00:00Z", + ) + + response = ClickhouseTrends().run( + Filter( + data={ + "date_from": "2020-01-01T00:00:00Z", + "date_to": "2020-01-12T00:00:00Z", + "breakdown": "key", + "events": [{"id": "sign up", "name": "sign up", "type": "events", "order": 0,}], + "properties": [{"key": "industry", "value": "finance", "type": "group", "group_type_index": 0}], + } + ), + self.team, + ) + + self.assertEqual(len(response), 2) + self.assertEqual(response[0]["breakdown_value"], "oh") + self.assertEqual(response[0]["count"], 1) + self.assertEqual(response[1]["breakdown_value"], "uh") + self.assertEqual(response[1]["count"], 1) + + @snapshot_clickhouse_queries + def test_breakdown_by_group_props(self): + self._create_groups() + + _create_event( + event="sign up", + distinct_id="person1", + team=self.team, + properties={"$group_0": "org:5"}, + timestamp="2020-01-02T12:00:00Z", + ) + _create_event( + event="sign up", + distinct_id="person1", + team=self.team, + properties={"$group_0": "org:6"}, + timestamp="2020-01-02T12:00:00Z", + ) + _create_event( + event="sign up", + distinct_id="person1", + team=self.team, + properties={"$group_0": "org:7", "$group_1": "company:10"}, + timestamp="2020-01-02T12:00:00Z", + ) + + response = ClickhouseTrends().run( + Filter( + data={ + "date_from": "2020-01-01T00:00:00Z", + "date_to": "2020-01-12T00:00:00Z", + "breakdown": "industry", + "breakdown_type": "group", + "breakdown_group_type_index": 0, + "events": [{"id": "sign up", "name": "sign up", "type": "events", "order": 0,}], + } + ), + self.team, + ) + + self.assertEqual(len(response), 2) + self.assertEqual(response[0]["breakdown_value"], "finance") + self.assertEqual(response[0]["count"], 2) + self.assertEqual(response[1]["breakdown_value"], "technology") + self.assertEqual(response[1]["count"], 1) + + @snapshot_clickhouse_queries + def test_breakdown_by_group_props_with_person_filter(self): + self._create_groups() + + Person.objects.create(team_id=self.team.pk, distinct_ids=["person1"], properties={"key": "value"}) + + _create_event( + event="sign up", + distinct_id="person1", + team=self.team, + properties={"$group_0": "org:5"}, + timestamp="2020-01-02T12:00:00Z", + ) + _create_event( + event="sign up", + distinct_id="person2", + team=self.team, + properties={"$group_0": "org:6"}, + timestamp="2020-01-02T12:00:00Z", + ) + + response = ClickhouseTrends().run( + Filter( + data={ + "date_from": "2020-01-01T00:00:00Z", + "date_to": "2020-01-12T00:00:00Z", + "breakdown": "industry", + "breakdown_type": "group", + "breakdown_group_type_index": 0, + "events": [{"id": "sign up", "name": "sign up", "type": "events", "order": 0,}], + "properties": [{"key": "key", "value": "value", "type": "person"}], + } + ), + self.team, + ) + + self.assertEqual(len(response), 1) + self.assertEqual(response[0]["breakdown_value"], "finance") + self.assertEqual(response[0]["count"], 1) + + @test_with_materialized_columns(["$some_property"]) + def test_breakdown_filtering_limit(self): + self._create_breakdown_events() + with freeze_time("2020-01-04T13:01:01Z"): + response = ClickhouseTrends().run( + Filter( + data={ + "date_from": "-14d", + "breakdown": "$some_property", + "events": [{"id": "sign up", "name": "sign up", "type": "events", "order": 0}], + } + ), + self.team, + ) + self.assertEqual(len(response), 25) # We fetch 25 to see if there are more than 20 values + + @test_with_materialized_columns(event_properties=["order"],
person_properties=["name"]) + def test_breakdown_with_person_property_filter(self): + self._create_multiple_people() action = _create_action(name="watched movie", team=self.team) with freeze_time("2020-01-04T13:01:01Z"): @@ -44,9 +241,9 @@ def test_breakdown_by_person_property(self): Filter( data={ "date_from": "-14d", - "breakdown": "name", - "breakdown_type": "person", + "breakdown": "order", "actions": [{"id": action.pk, "type": "actions", "order": 0}], + "properties": [{"key": "name", "value": "person2", "type": "person"}], } ), self.team, @@ -55,27 +252,26 @@ def test_breakdown_by_person_property(self): Filter( data={ "date_from": "-14d", - "breakdown": "name", - "breakdown_type": "person", - "events": [{"id": "watched movie", "name": "watched movie", "type": "events", "order": 0,}], + "breakdown": "order", + "events": [ + { + "id": "watched movie", + "name": "watched movie", + "type": "events", + "order": 0, + "properties": [{"key": "name", "value": "person2", "type": "person"}], + } + ], } ), self.team, ) - self.assertListEqual([res["breakdown_value"] for res in event_response], ["person1", "person2", "person3"]) - - for response in event_response: - if response["breakdown_value"] == "person1": - self.assertEqual(response["count"], 1) - self.assertEqual(response["label"], "watched movie - person1") - if response["breakdown_value"] == "person2": - self.assertEqual(response["count"], 3) - if response["breakdown_value"] == "person3": - self.assertEqual(response["count"], 3) - - self.assertTrue(self._compare_entity_response(event_response, action_response,)) + self.assertDictContainsSubset({"count": 2, "breakdown_value": "2",}, event_response[0]) + self.assertDictContainsSubset({"count": 1, "breakdown_value": "1",}, event_response[1]) + self.assertEntityResponseEqual(event_response, action_response) + @test_with_materialized_columns(["$some_property"]) def test_breakdown_filtering(self): self._create_events() # test breakdown filtering @@ -94,17 +290,79 @@ def test_breakdown_filtering(self): self.team, ) - self.assertEqual(response[0]["label"], "sign up - value") - self.assertEqual(response[1]["label"], "sign up - other_value") - self.assertEqual(response[2]["label"], "no events - value") - self.assertEqual(response[3]["label"], "no events - other_value") + self.assertEqual(response[0]["label"], "sign up - none") + self.assertEqual(response[1]["label"], "sign up - value") + self.assertEqual(response[2]["label"], "sign up - other_value") + self.assertEqual(response[3]["label"], "no events - none") self.assertEqual(sum(response[0]["data"]), 2) - self.assertEqual(response[0]["breakdown_value"], "value") + self.assertEqual(sum(response[1]["data"]), 2) + self.assertEqual(sum(response[2]["data"]), 1) + self.assertEqual(sum(response[3]["data"]), 1) - self.assertEqual(sum(response[1]["data"]), 1) - self.assertEqual(response[1]["breakdown_value"], "other_value") + @test_with_materialized_columns(person_properties=["email"]) + def test_breakdown_filtering_persons(self): + Person.objects.create(team_id=self.team.pk, distinct_ids=["person1"], properties={"email": "test@posthog.com"}) + Person.objects.create(team_id=self.team.pk, distinct_ids=["person2"], properties={"email": "test@gmail.com"}) + Person.objects.create(team_id=self.team.pk, distinct_ids=["person3"], properties={}) + + _create_event(event="sign up", distinct_id="person1", team=self.team, properties={"key": "val"}) + _create_event(event="sign up", distinct_id="person2", team=self.team, properties={"key": "val"}) + 
_create_event(event="sign up", distinct_id="person3", team=self.team, properties={"key": "val"}) + response = ClickhouseTrends().run( + Filter( + data={ + "date_from": "-14d", + "breakdown": "email", + "breakdown_type": "person", + "events": [{"id": "sign up", "name": "sign up", "type": "events", "order": 0,},], + } + ), + self.team, + ) + self.assertEqual(response[0]["label"], "sign up - none") + self.assertEqual(response[1]["label"], "sign up - test@gmail.com") + self.assertEqual(response[2]["label"], "sign up - test@posthog.com") + + self.assertEqual(response[0]["count"], 1) + self.assertEqual(response[1]["count"], 1) + self.assertEqual(response[2]["count"], 1) + + # ensure that column names are properly handled when subqueries and person subquery share properties column + @test_with_materialized_columns(event_properties=["key"], person_properties=["email"]) + def test_breakdown_filtering_persons_with_action_props(self): + Person.objects.create(team_id=self.team.pk, distinct_ids=["person1"], properties={"email": "test@posthog.com"}) + Person.objects.create(team_id=self.team.pk, distinct_ids=["person2"], properties={"email": "test@gmail.com"}) + Person.objects.create(team_id=self.team.pk, distinct_ids=["person3"], properties={}) + + _create_event(event="sign up", distinct_id="person1", team=self.team, properties={"key": "val"}) + _create_event(event="sign up", distinct_id="person2", team=self.team, properties={"key": "val"}) + _create_event(event="sign up", distinct_id="person3", team=self.team, properties={"key": "val"}) + action = _create_action( + name="sign up", + team=self.team, + properties=[{"key": "key", "type": "event", "value": ["val"], "operator": "exact"}], + ) + response = ClickhouseTrends().run( + Filter( + data={ + "date_from": "-14d", + "breakdown": "email", + "breakdown_type": "person", + "actions": [{"id": action.pk, "type": "actions", "order": 0}], + } + ), + self.team, + ) + self.assertEqual(response[0]["label"], "sign up - none") + self.assertEqual(response[1]["label"], "sign up - test@gmail.com") + self.assertEqual(response[2]["label"], "sign up - test@posthog.com") + self.assertEqual(response[0]["count"], 1) + self.assertEqual(response[1]["count"], 1) + self.assertEqual(response[2]["count"], 1) + + @test_with_materialized_columns(["$current_url", "$os", "$browser"]) def test_breakdown_filtering_with_properties(self): with freeze_time("2020-01-03T13:01:01Z"): _create_event( @@ -154,15 +412,17 @@ def test_breakdown_filtering_with_properties(self): self.team, ) - self.assertEqual(response[0]["label"], "sign up - second url") - self.assertEqual(response[1]["label"], "sign up - first url") + response = sorted(response, key=lambda x: x["label"]) + self.assertEqual(response[0]["label"], "sign up - first url") + self.assertEqual(response[1]["label"], "sign up - second url") self.assertEqual(sum(response[0]["data"]), 1) - self.assertEqual(response[0]["breakdown_value"], "second url") + self.assertEqual(response[0]["breakdown_value"], "first url") self.assertEqual(sum(response[1]["data"]), 1) - self.assertEqual(response[1]["breakdown_value"], "first url") + self.assertEqual(response[1]["breakdown_value"], "second url") + @test_with_materialized_columns(["$some_property"]) def test_dau_with_breakdown_filtering(self): sign_up_action, _ = self._create_events() with freeze_time("2020-01-02T13:01:01Z"): @@ -178,12 +438,647 @@ def test_dau_with_breakdown_filtering(self): Filter(data={"breakdown": "$some_property", "events": [{"id": "sign up", "math": "dau"}]}), self.team, ) - 
self.assertEqual(event_response[0]["label"], "sign up - value") - self.assertEqual(event_response[1]["label"], "sign up - other_value") + self.assertEqual(event_response[1]["label"], "sign up - value") + self.assertEqual(event_response[2]["label"], "sign up - other_value") + + self.assertEqual(sum(event_response[1]["data"]), 1) + self.assertEqual(event_response[1]["data"][4], 1) # property not defined + + self.assertEqual(sum(event_response[2]["data"]), 1) + self.assertEqual(event_response[2]["data"][5], 1) + self.assertEntityResponseEqual(action_response, event_response) + + @test_with_materialized_columns(["$os", "$some_property"]) + def test_dau_with_breakdown_filtering_with_prop_filter(self): + sign_up_action, _ = self._create_events() + with freeze_time("2020-01-02T13:01:01Z"): + _create_event( + team=self.team, + event="sign up", + distinct_id="blabla", + properties={"$some_property": "other_value", "$os": "Windows"}, + ) + with freeze_time("2020-01-04T13:01:01Z"): + action_response = ClickhouseTrends().run( + Filter( + data={ + "breakdown": "$some_property", + "actions": [{"id": sign_up_action.id, "math": "dau"}], + "properties": [{"key": "$os", "value": "Windows"}], + } + ), + self.team, + ) + event_response = ClickhouseTrends().run( + Filter( + data={ + "breakdown": "$some_property", + "events": [{"id": "sign up", "math": "dau"}], + "properties": [{"key": "$os", "value": "Windows"}], + } + ), + self.team, + ) + + self.assertEqual(event_response[0]["label"], "sign up - other_value") self.assertEqual(sum(event_response[0]["data"]), 1) - self.assertEqual(event_response[0]["data"][4], 1) # property not defined + self.assertEqual(event_response[0]["data"][5], 1) # property not defined - self.assertEqual(sum(event_response[1]["data"]), 1) - self.assertEqual(event_response[1]["data"][5], 1) - self.assertTrue(self._compare_entity_response(action_response, event_response)) + self.assertEntityResponseEqual(action_response, event_response) + + @test_with_materialized_columns(event_properties=["$host"], person_properties=["$some_prop"]) + def test_against_clashing_entity_and_property_filter_naming(self): + # Regression test for https://github.com/PostHog/posthog/issues/5814 + Person.objects.create( + team_id=self.team.pk, distinct_ids=["blabla", "anonymous_id"], properties={"$some_prop": "some_val"} + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="blabla", + properties={"$host": "app.example.com"}, + timestamp="2020-01-03T12:00:00Z", + ) + + with freeze_time("2020-01-04T13:01:01Z"): + response = ClickhouseTrends().run( + Filter( + data={ + "events": [ + { + "id": "$pageview", + "properties": [{"key": "$host", "operator": "icontains", "value": ".com"}], + } + ], + "properties": [{"key": "$host", "value": ["app.example.com", "another.com"]}], + "breakdown": "$some_prop", + "breakdown_type": "person", + } + ), + self.team, + ) + + self.assertEqual(response[0]["count"], 1) + + # this ensures that the properties don't conflict when formatting params + @test_with_materialized_columns(["$current_url"]) + def test_action_with_prop(self): + person = Person.objects.create( + team_id=self.team.pk, distinct_ids=["blabla", "anonymous_id"], properties={"$some_prop": "some_val"} + ) + sign_up_action = Action.objects.create(team=self.team, name="sign up") + ActionStep.objects.create( + action=sign_up_action, event="sign up", properties={"$current_url": "https://posthog.com/feedback/1234"} + ) + + with freeze_time("2020-01-02T13:01:01Z"): + _create_event( + team=self.team, + 
event="sign up", + distinct_id="blabla", + properties={"$current_url": "https://posthog.com/feedback/1234"}, + ) + + with freeze_time("2020-01-04T13:01:01Z"): + action_response = ClickhouseTrends().run( + Filter( + data={ + "actions": [{"id": sign_up_action.id, "math": "dau"}], + "properties": [{"key": "$current_url", "value": "fake"}], + } + ), + self.team, + ) + + # if the params were shared it would be 1 because action would take precedence + self.assertEqual(action_response[0]["count"], 0) + + @test_with_materialized_columns(["$current_url"], verify_no_jsonextract=False) + def test_combine_all_cohort_and_icontains(self): + # This caused some issues with SQL parsing + sign_up_action, _ = self._create_events() + cohort = Cohort.objects.create(team=self.team, name="a", groups=[{"properties": {"key": "value"}}]) + action_response = ClickhouseTrends().run( + Filter( + data={ + "actions": [{"id": sign_up_action.id, "math": "dau"}], + "properties": [{"key": "$current_url", "value": "ii", "operator": "icontains"}], + "breakdown": [cohort.pk, "all"], + "breakdown_type": "cohort", + } + ), + self.team, + ) + self.assertEqual(action_response[0]["count"], 0) + + @test_with_materialized_columns(event_properties=["key"], person_properties=["email"]) + def test_breakdown_user_props_with_filter(self): + Person.objects.create(team_id=self.team.pk, distinct_ids=["person1"], properties={"email": "test@posthog.com"}) + Person.objects.create(team_id=self.team.pk, distinct_ids=["person2"], properties={"email": "test@gmail.com"}) + person = Person.objects.create( + team_id=self.team.pk, distinct_ids=["person3"], properties={"email": "test@gmail.com"} + ) + create_person_distinct_id(self.team.pk, "person1", str(person.uuid)) + + _create_event(event="sign up", distinct_id="person1", team=self.team, properties={"key": "val"}) + _create_event(event="sign up", distinct_id="person2", team=self.team, properties={"key": "val"}) + response = ClickhouseTrends().run( + Filter( + data={ + "date_from": "-14d", + "breakdown": "email", + "breakdown_type": "person", + "events": [{"id": "sign up", "name": "sign up", "type": "events", "order": 0,}], + "properties": [ + {"key": "email", "value": "@posthog.com", "operator": "not_icontains", "type": "person"}, + {"key": "key", "value": "val"}, + ], + } + ), + self.team, + ) + + self.assertEqual(len(response), 1) + self.assertEqual(response[0]["breakdown_value"], "test@gmail.com") + + def _create_active_user_events(self): + p0 = Person.objects.create(team_id=self.team.pk, distinct_ids=["p0"], properties={"name": "p1"}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p0", + timestamp="2020-01-03T12:00:00Z", + properties={"key": "val"}, + ) + + p1 = Person.objects.create(team_id=self.team.pk, distinct_ids=["p1"], properties={"name": "p1"}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-09T12:00:00Z", + properties={"key": "val"}, + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-10T12:00:00Z", + properties={"key": "val"}, + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-11T12:00:00Z", + properties={"key": "val"}, + ) + + p2 = Person.objects.create(team_id=self.team.pk, distinct_ids=["p2"], properties={"name": "p2"}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p2", + timestamp="2020-01-09T12:00:00Z", + properties={"key": "val"}, + ) + _create_event( + team=self.team, + 
event="$pageview", + distinct_id="p2", + timestamp="2020-01-11T12:00:00Z", + properties={"key": "val"}, + ) + + def test_active_user_math(self): + self._create_active_user_events() + + data = { + "date_from": "2020-01-09T00:00:00Z", + "date_to": "2020-01-16T00:00:00Z", + "events": [{"id": "$pageview", "type": "events", "order": 0, "math": "weekly_active"}], + } + + filter = Filter(data=data) + result = ClickhouseTrends().run(filter, self.team,) + self.assertEqual(result[0]["data"], [3.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0]) + + def test_active_user_math_action(self): + action = _create_action(name="$pageview", team=self.team) + self._create_active_user_events() + + data = { + "date_from": "2020-01-09T00:00:00Z", + "date_to": "2020-01-16T00:00:00Z", + "actions": [{"id": action.id, "type": "actions", "order": 0, "math": "weekly_active"}], + } + + filter = Filter(data=data) + result = ClickhouseTrends().run(filter, self.team,) + self.assertEqual(result[0]["data"], [3.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0]) + + @test_with_materialized_columns(["key"]) + def test_breakdown_active_user_math(self): + + p1 = Person.objects.create(team_id=self.team.pk, distinct_ids=["p1"], properties={"name": "p1"}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-09T12:00:00Z", + properties={"key": "val"}, + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-10T12:00:00Z", + properties={"key": "val"}, + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-11T12:00:00Z", + properties={"key": "val"}, + ) + + p2 = Person.objects.create(team_id=self.team.pk, distinct_ids=["p2"], properties={"name": "p2"}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p2", + timestamp="2020-01-09T12:00:00Z", + properties={"key": "val"}, + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p2", + timestamp="2020-01-11T12:00:00Z", + properties={"key": "val"}, + ) + + data = { + "date_from": "2020-01-01T00:00:00Z", + "date_to": "2020-01-12T00:00:00Z", + "breakdown": "key", + "events": [{"id": "$pageview", "type": "events", "order": 0, "math": "weekly_active"}], + } + + filter = Filter(data=data) + result = ClickhouseTrends().run(filter, self.team,) + self.assertEqual(result[0]["data"], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 2.0, 0.0]) + + @test_with_materialized_columns(event_properties=["key"], person_properties=["name"]) + def test_filter_test_accounts(self): + p1 = Person.objects.create(team_id=self.team.pk, distinct_ids=["p1"], properties={"name": "p1"}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-11T12:00:00Z", + properties={"key": "val"}, + ) + + p2 = Person.objects.create(team_id=self.team.pk, distinct_ids=["p2"], properties={"name": "p2"}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p2", + timestamp="2020-01-11T12:00:00Z", + properties={"key": "val"}, + ) + self.team.test_account_filters = [{"key": "name", "value": "p1", "operator": "is_not", "type": "person"}] + self.team.save() + filter = Filter( + { + "date_from": "2020-01-01T00:00:00Z", + "date_to": "2020-01-12T00:00:00Z", + "events": [{"id": "$pageview", "type": "events", "order": 0}], + "filter_test_accounts": "true", + }, + team=self.team, + ) + result = ClickhouseTrends().run(filter, self.team,) + self.assertEqual(result[0]["count"], 1) + filter2 = Filter( + { + "date_from": "2020-01-01T00:00:00Z", 
+ "date_to": "2020-01-12T00:00:00Z", + "events": [{"id": "$pageview", "type": "events", "order": 0}], + }, + team=self.team, + ) + result = ClickhouseTrends().run(filter2, self.team,) + self.assertEqual(result[0]["count"], 2) + result = ClickhouseTrends().run(filter.with_data({"breakdown": "key"}), self.team,) + self.assertEqual(result[0]["count"], 1) + + @test_with_materialized_columns(["$some_property"]) + def test_breakdown_filtering_bar_chart_by_value(self): + self._create_events() + + # test breakdown filtering + with freeze_time("2020-01-04T13:01:01Z"): + response = ClickhouseTrends().run( + Filter( + data={ + "date_from": "-7d", + "breakdown": "$some_property", + "events": [{"id": "sign up", "name": "sign up", "type": "events", "order": 0,},], + "display": TRENDS_BAR_VALUE, + } + ), + self.team, + ) + + self.assertEqual(response[0]["aggregated_value"], 2) # the events without breakdown value + self.assertEqual(response[1]["aggregated_value"], 1) + self.assertEqual(response[2]["aggregated_value"], 1) + self.assertEqual( + response[0]["days"], + [ + "2019-12-28", + "2019-12-29", + "2019-12-30", + "2019-12-31", + "2020-01-01", + "2020-01-02", + "2020-01-03", + "2020-01-04", + ], + ) + + @test_with_materialized_columns(person_properties=["key", "key_2"], verify_no_jsonextract=False) + def test_breakdown_multiple_cohorts(self): + p1 = Person.objects.create(team_id=self.team.pk, distinct_ids=["p1"], properties={"key": "value"}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-02T12:00:00Z", + properties={"key": "val"}, + ) + + p2 = Person.objects.create(team_id=self.team.pk, distinct_ids=["p2"], properties={"key_2": "value_2"}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p2", + timestamp="2020-01-02T12:00:00Z", + properties={"key": "val"}, + ) + + p3 = Person.objects.create(team_id=self.team.pk, distinct_ids=["p3"], properties={"key_2": "value_2"}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p3", + timestamp="2020-01-02T12:00:00Z", + properties={"key": "val"}, + ) + + cohort1 = _create_cohort( + team=self.team, + name="cohort_1", + groups=[{"properties": [{"key": "key", "value": "value", "type": "person"}]}], + ) + cohort2 = _create_cohort( + team=self.team, + name="cohort_2", + groups=[{"properties": [{"key": "key_2", "value": "value_2", "type": "person"}]}], + ) + + cohort1.calculate_people() + cohort1.calculate_people_ch() + + cohort2.calculate_people() + cohort2.calculate_people_ch() + + with self.settings(USE_PRECALCULATED_CH_COHORT_PEOPLE=True): # Normally this is False in tests + with freeze_time("2020-01-04T13:01:01Z"): + res = ClickhouseTrends().run( + Filter( + data={ + "date_from": "-7d", + "events": [{"id": "$pageview"}], + "properties": [], + "breakdown": [cohort1.pk, cohort2.pk], + "breakdown_type": "cohort", + } + ), + self.team, + ) + + self.assertEqual(res[0]["count"], 1) + self.assertEqual(res[1]["count"], 2) + + @test_with_materialized_columns(person_properties=["key", "key_2"], verify_no_jsonextract=False) + def test_breakdown_single_cohort(self): + p1 = Person.objects.create(team_id=self.team.pk, distinct_ids=["p1"], properties={"key": "value"}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-02T12:00:00Z", + properties={"key": "val"}, + ) + + p2 = Person.objects.create(team_id=self.team.pk, distinct_ids=["p2"], properties={"key_2": "value_2"}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p2", 
+ timestamp="2020-01-02T12:00:00Z", + properties={"key": "val"}, + ) + + p3 = Person.objects.create(team_id=self.team.pk, distinct_ids=["p3"], properties={"key_2": "value_2"}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p3", + timestamp="2020-01-02T12:00:00Z", + properties={"key": "val"}, + ) + + cohort1 = _create_cohort( + team=self.team, + name="cohort_1", + groups=[{"properties": [{"key": "key", "value": "value", "type": "person"}]}], + ) + + cohort1.calculate_people() + cohort1.calculate_people_ch() + + with self.settings(USE_PRECALCULATED_CH_COHORT_PEOPLE=True): # Normally this is False in tests + with freeze_time("2020-01-04T13:01:01Z"): + res = ClickhouseTrends().run( + Filter( + data={ + "date_from": "-7d", + "events": [{"id": "$pageview"}], + "properties": [], + "breakdown": cohort1.pk, + "breakdown_type": "cohort", + } + ), + self.team, + ) + + self.assertEqual(res[0]["count"], 1) + + @test_with_materialized_columns(["key", "$current_url"]) + def test_filtering_with_action_props(self): + _create_event( + event="sign up", + distinct_id="person1", + team=self.team, + properties={"key": "val", "$current_url": "/some/page"}, + ) + _create_event( + event="sign up", + distinct_id="person2", + team=self.team, + properties={"key": "val", "$current_url": "/some/page"}, + ) + _create_event( + event="sign up", + distinct_id="person3", + team=self.team, + properties={"key": "val", "$current_url": "/another/page"}, + ) + + action = Action.objects.create(name="sign up", team=self.team) + ActionStep.objects.create( + action=action, + event="sign up", + url="/some/page", + properties=[{"key": "key", "type": "event", "value": ["val"], "operator": "exact"}], + ) + + response = ClickhouseTrends().run( + Filter(data={"date_from": "-14d", "actions": [{"id": action.pk, "type": "actions", "order": 0}],}), + self.team, + ) + + self.assertEqual(response[0]["count"], 2) + + def test_trends_math_without_math_property(self): + with self.assertRaises(ValidationError): + ClickhouseTrends().run( + Filter(data={"events": [{"id": "sign up", "math": "sum"}]}), self.team, + ) + + @snapshot_clickhouse_queries + def test_filtering_with_group_props(self): + self._create_groups() + + Person.objects.create(team_id=self.team.pk, distinct_ids=["person1"], properties={"key": "value"}) + _create_event( + event="$pageview", distinct_id="person1", team=self.team, timestamp="2020-01-02T12:00:00Z", + ) + _create_event( + event="$pageview", + distinct_id="person1", + team=self.team, + properties={"$group_0": "org:5"}, + timestamp="2020-01-02T12:00:00Z", + ) + _create_event( + event="$pageview", + distinct_id="person1", + team=self.team, + properties={"$group_0": "org:6"}, + timestamp="2020-01-02T12:00:00Z", + ) + _create_event( + event="$pageview", + distinct_id="person1", + team=self.team, + properties={"$group_0": "org:6", "$group_1": "company:10"}, + timestamp="2020-01-02T12:00:00Z", + ) + + filter = Filter( + { + "date_from": "2020-01-01T00:00:00Z", + "date_to": "2020-01-12T00:00:00Z", + "events": [{"id": "$pageview", "type": "events", "order": 0}], + "properties": [ + {"key": "industry", "value": "finance", "type": "group", "group_type_index": 0}, + {"key": "key", "value": "value", "type": "person"}, + ], + }, + team=self.team, + ) + + response = ClickhouseTrends().run(filter, self.team) + self.assertEqual(response[0]["count"], 1) + + @snapshot_clickhouse_queries + def test_aggregating_by_group(self): + self._create_groups() + + _create_event( + event="$pageview", + distinct_id="person1", + 
team=self.team, + properties={"$group_0": "org:5"}, + timestamp="2020-01-02T12:00:00Z", + ) + _create_event( + event="$pageview", + distinct_id="person1", + team=self.team, + properties={"$group_0": "org:6"}, + timestamp="2020-01-02T12:00:00Z", + ) + _create_event( + event="$pageview", + distinct_id="person1", + team=self.team, + properties={"$group_0": "org:6", "$group_1": "company:10"}, + timestamp="2020-01-02T12:00:00Z", + ) + + filter = Filter( + { + "date_from": "2020-01-01T00:00:00Z", + "date_to": "2020-01-12T00:00:00Z", + "events": [ + { + "id": "$pageview", + "type": "events", + "order": 0, + "math": "unique_group", + "math_group_type_index": 0, + } + ], + }, + team=self.team, + ) + + response = ClickhouseTrends().run(filter, self.team) + self.assertEqual(response[0]["count"], 2) + + filter = Filter( + { + "date_from": "2020-01-01T00:00:00Z", + "date_to": "2020-01-12T00:00:00Z", + "events": [ + { + "id": "$pageview", + "type": "events", + "order": 0, + "math": "unique_group", + "math_group_type_index": 1, + } + ], + }, + team=self.team, + ) + response = ClickhouseTrends().run(filter, self.team) + self.assertEqual(response[0]["count"], 1) diff --git a/ee/clickhouse/queries/test/test_util.py b/ee/clickhouse/queries/test/test_util.py new file mode 100644 index 0000000000000..83d962bcca8cf --- /dev/null +++ b/ee/clickhouse/queries/test/test_util.py @@ -0,0 +1,43 @@ +from datetime import datetime +from uuid import uuid4 + +import pytz +from freezegun.api import freeze_time + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.models.event import create_event +from ee.clickhouse.queries.breakdown_props import _parse_breakdown_cohorts +from ee.clickhouse.queries.util import get_earliest_timestamp +from posthog.models.action import Action +from posthog.models.action_step import ActionStep +from posthog.models.cohort import Cohort + + +def _create_event(**kwargs): + pk = uuid4() + kwargs.update({"event_uuid": pk}) + create_event(**kwargs) + + +@freeze_time("2021-01-21") +def test_get_earliest_timestamp(db, team): + _create_event(team=team, event="sign up", distinct_id="1", timestamp="2020-01-04T14:10:00Z") + _create_event(team=team, event="sign up", distinct_id="1", timestamp="2020-01-06T14:10:00Z") + + assert get_earliest_timestamp(team.id) == datetime(2020, 1, 4, 14, 10, tzinfo=pytz.UTC) + + +@freeze_time("2021-01-21") +def test_get_earliest_timestamp_with_no_events(db, team): + assert get_earliest_timestamp(team.id) == datetime(2021, 1, 14, tzinfo=pytz.UTC) + + +def test_parse_breakdown_cohort_query(db, team): + action = Action.objects.create(team=team, name="$pageview") + ActionStep.objects.create(action=action, event="$pageview") + cohort1 = Cohort.objects.create( + team=team, groups=[{"action_id": action.pk, "start_date": datetime(2020, 1, 8, 12, 0, 1)}], name="cohort1", + ) + queries, params = _parse_breakdown_cohorts([cohort1]) + assert len(queries) == 1 + sync_execute(queries[0], params) diff --git a/ee/clickhouse/queries/trends/breakdown.py b/ee/clickhouse/queries/trends/breakdown.py index 83424ad1cfa73..2849f2a61cdf8 100644 --- a/ee/clickhouse/queries/trends/breakdown.py +++ b/ee/clickhouse/queries/trends/breakdown.py @@ -1,221 +1,272 @@ -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union -from django.db.models.manager import BaseManager -from django.utils import timezone - -from ee.clickhouse.client import sync_execute from ee.clickhouse.models.action import format_action_filter -from 
ee.clickhouse.models.cohort import format_filter_query -from ee.clickhouse.models.property import parse_prop_clauses -from ee.clickhouse.queries.trends.util import parse_response, process_math -from ee.clickhouse.queries.util import get_interval_annotation_ch, get_time_diff, parse_timestamps -from ee.clickhouse.sql.events import NULL_BREAKDOWN_SQL, NULL_SQL -from ee.clickhouse.sql.person import GET_LATEST_PERSON_SQL +from ee.clickhouse.models.property import get_property_string_expr, parse_prop_clauses +from ee.clickhouse.models.util import PersonPropertiesMode +from ee.clickhouse.queries.breakdown_props import ( + ALL_USERS_COHORT_ID, + format_breakdown_cohort_join_query, + get_breakdown_cohort_name, + get_breakdown_prop_values, +) +from ee.clickhouse.queries.column_optimizer import ColumnOptimizer +from ee.clickhouse.queries.groups_join_query import GroupsJoinQuery +from ee.clickhouse.queries.person_query import ClickhousePersonQuery +from ee.clickhouse.queries.trends.util import enumerate_time_range, get_active_user_params, parse_response, process_math +from ee.clickhouse.queries.util import date_from_clause, get_time_diff, get_trunc_func_ch, parse_timestamps +from ee.clickhouse.sql.events import EVENT_JOIN_PERSON_SQL +from ee.clickhouse.sql.person import GET_TEAM_PERSON_DISTINCT_IDS from ee.clickhouse.sql.trends.breakdown import ( + BREAKDOWN_ACTIVE_USER_CONDITIONS_SQL, + BREAKDOWN_ACTIVE_USER_INNER_SQL, + BREAKDOWN_AGGREGATE_QUERY_SQL, BREAKDOWN_COHORT_JOIN_SQL, - BREAKDOWN_CONDITIONS_SQL, - BREAKDOWN_DEFAULT_SQL, - BREAKDOWN_PERSON_PROP_JOIN_SQL, + BREAKDOWN_INNER_SQL, BREAKDOWN_PROP_JOIN_SQL, BREAKDOWN_QUERY_SQL, ) -from ee.clickhouse.sql.trends.top_elements import TOP_ELEMENTS_ARRAY_OF_KEY_SQL -from ee.clickhouse.sql.trends.top_person_props import TOP_PERSON_PROPS_ARRAY_OF_KEY_SQL -from posthog.constants import TREND_FILTER_TYPE_ACTIONS -from posthog.models.action import Action -from posthog.models.cohort import Cohort +from posthog.constants import MONTHLY_ACTIVE, TREND_FILTER_TYPE_ACTIONS, TRENDS_DISPLAY_BY_VALUE, WEEKLY_ACTIVE from posthog.models.entity import Entity -from posthog.models.filter import Filter +from posthog.models.filters import Filter class ClickhouseTrendsBreakdown: - def _serialize_breakdown(self, entity: Entity, filter: Filter, team_id: int): - if isinstance(filter.breakdown, list) and "all" in filter.breakdown: - result = [] - filter.breakdown = filter.breakdown if filter.breakdown and isinstance(filter.breakdown, list) else [] - filter.breakdown.remove("all") - - # handle breakdown by all and by specific props separately - if filter.breakdown: - result.extend(self._format_breakdown_query(entity, filter, team_id)) - - filter.breakdown = ["all"] - all_result = self._format_breakdown_query(entity, filter, team_id) - - result.extend(all_result) - else: - result = self._format_breakdown_query(entity, filter, team_id) - return result + def __init__( + self, entity: Entity, filter: Filter, team_id: int, column_optimizer: Optional[ColumnOptimizer] = None + ): + self.entity = entity + self.filter = filter + self.team_id = team_id + self.params: Dict[str, Any] = {"team_id": team_id} + self.column_optimizer = column_optimizer or ColumnOptimizer(self.filter, self.team_id) - def _format_breakdown_query(self, entity: Entity, filter: Filter, team_id: int) -> List[Dict[str, Any]]: - - # process params - params: Dict[str, Any] = {"team_id": team_id} - interval_annotation = get_interval_annotation_ch(filter.interval) - num_intervals, seconds_in_interval = 
get_time_diff(filter.interval or "day", filter.date_from, filter.date_to) - parsed_date_from, parsed_date_to = parse_timestamps(filter=filter) + def get_query(self) -> Tuple[str, Dict, Callable]: + interval_annotation = get_trunc_func_ch(self.filter.interval) + num_intervals, seconds_in_interval, round_interval = get_time_diff( + self.filter.interval, self.filter.date_from, self.filter.date_to, self.team_id + ) + _, parsed_date_to, date_params = parse_timestamps(filter=self.filter, team_id=self.team_id) - props_to_filter = [*filter.properties, *entity.properties] - prop_filters, prop_filter_params = parse_prop_clauses(props_to_filter, team_id) - aggregate_operation, join_condition, math_params = process_math(entity) + props_to_filter = [*self.filter.properties, *self.entity.properties] + prop_filters, prop_filter_params = parse_prop_clauses( + props_to_filter, self.team_id, table_name="e", person_properties_mode=PersonPropertiesMode.EXCLUDE, + ) + aggregate_operation, _, math_params = process_math(self.entity) action_query = "" action_params: Dict = {} - if entity.type == TREND_FILTER_TYPE_ACTIONS: - action = Action.objects.get(pk=entity.id) - action_query, action_params = format_action_filter(action) - - null_sql = NULL_BREAKDOWN_SQL.format( - interval=interval_annotation, - seconds_in_interval=seconds_in_interval, - num_intervals=num_intervals, - date_to=(filter.date_to).strftime("%Y-%m-%d %H:%M:%S"), - ) + if self.entity.type == TREND_FILTER_TYPE_ACTIONS: + action = self.entity.get_action() + action_query, action_params = format_action_filter(action, table_name="e") - params = { - **params, + self.params = { + **self.params, **math_params, **prop_filter_params, **action_params, - "event": entity.id, - "key": filter.breakdown, + "event": self.entity.id, + "key": self.filter.breakdown, + **date_params, } - top_elements_array = [] breakdown_filter_params = { - "parsed_date_from": parsed_date_from, + "parsed_date_from": date_from_clause(interval_annotation, round_interval), "parsed_date_to": parsed_date_to, "actions_query": "AND {}".format(action_query) if action_query else "", "event_filter": "AND event = %(event)s" if not action_query else "", "filters": prop_filters if props_to_filter else "", } - if filter.breakdown_type == "cohort": - breakdown = filter.breakdown if filter.breakdown and isinstance(filter.breakdown, list) else [] - if "all" in breakdown: - null_sql = NULL_SQL - breakdown_filter = BREAKDOWN_CONDITIONS_SQL - breakdown_query = BREAKDOWN_DEFAULT_SQL - else: - cohort_queries, cohort_ids, cohort_params = self._format_breakdown_cohort_join_query(breakdown, team_id) - params = {**params, "values": cohort_ids, **cohort_params} - breakdown_filter = BREAKDOWN_COHORT_JOIN_SQL - breakdown_filter_params = {**breakdown_filter_params, "cohort_queries": cohort_queries} - breakdown_query = BREAKDOWN_QUERY_SQL - elif filter.breakdown_type == "person": - elements_query = TOP_PERSON_PROPS_ARRAY_OF_KEY_SQL.format( - parsed_date_from=parsed_date_from, - parsed_date_to=parsed_date_to, - latest_person_sql=GET_LATEST_PERSON_SQL.format(query=""), + _params, _breakdown_filter_params = {}, {} + + if self.filter.breakdown_type == "cohort": + _params, breakdown_filter, _breakdown_filter_params, breakdown_value = self._breakdown_cohort_params() + else: + _params, breakdown_filter, _breakdown_filter_params, breakdown_value = self._breakdown_prop_params( + "count(*)" if self.entity.math == "dau" else aggregate_operation, math_params, ) - top_elements_array = self._get_top_elements(elements_query, 
filter, team_id) - params = { - **params, - "values": top_elements_array, - } - breakdown_filter = BREAKDOWN_PERSON_PROP_JOIN_SQL - breakdown_filter_params = { - **breakdown_filter_params, - "latest_person_sql": GET_LATEST_PERSON_SQL.format(query=""), - } - breakdown_query = BREAKDOWN_QUERY_SQL + + if len(_params["values"]) == 0: + # If there are no breakdown values, we are sure that there's no relevant events, so instead of adjusting + # a "real" SELECT for this, we only include the below dummy SELECT. + # It's a drop-in replacement for a "real" one, simply always returning 0 rows. + # See https://github.com/PostHog/posthog/pull/5674 for context. + return ( + "SELECT [now()] AS date, [0] AS data, '' AS breakdown_value LIMIT 0", + {}, + lambda _: [], + ) + + person_join_condition, person_join_params = self._person_join_condition() + groups_join_condition, groups_join_params = GroupsJoinQuery( + self.filter, self.team_id, self.column_optimizer + ).get_join_query() + self.params = {**self.params, **_params, **person_join_params, **groups_join_params} + breakdown_filter_params = {**breakdown_filter_params, **_breakdown_filter_params} + + if self.filter.display in TRENDS_DISPLAY_BY_VALUE: + breakdown_filter = breakdown_filter.format(**breakdown_filter_params) + content_sql = BREAKDOWN_AGGREGATE_QUERY_SQL.format( + breakdown_filter=breakdown_filter, + person_join=person_join_condition, + groups_join=groups_join_condition, + aggregate_operation=aggregate_operation, + breakdown_value=breakdown_value, + ) + time_range = enumerate_time_range(self.filter, seconds_in_interval) + + return ( + content_sql, + self.params, + self._parse_single_aggregate_result(self.filter, self.entity, {"days": time_range}), + ) + else: - elements_query = TOP_ELEMENTS_ARRAY_OF_KEY_SQL.format( - parsed_date_from=parsed_date_from, parsed_date_to=parsed_date_to + breakdown_filter = breakdown_filter.format(**breakdown_filter_params) + + if self.entity.math in [WEEKLY_ACTIVE, MONTHLY_ACTIVE]: + active_user_params = get_active_user_params(self.filter, self.entity, self.team_id) + conditions = BREAKDOWN_ACTIVE_USER_CONDITIONS_SQL.format( + **breakdown_filter_params, **active_user_params + ) + inner_sql = BREAKDOWN_ACTIVE_USER_INNER_SQL.format( + breakdown_filter=breakdown_filter, + person_join=person_join_condition, + groups_join=groups_join_condition, + aggregate_operation=aggregate_operation, + interval_annotation=interval_annotation, + breakdown_value=breakdown_value, + conditions=conditions, + GET_TEAM_PERSON_DISTINCT_IDS=GET_TEAM_PERSON_DISTINCT_IDS, + **active_user_params, + **breakdown_filter_params, + ) + else: + inner_sql = BREAKDOWN_INNER_SQL.format( + breakdown_filter=breakdown_filter, + person_join=person_join_condition, + groups_join=groups_join_condition, + aggregate_operation=aggregate_operation, + interval_annotation=interval_annotation, + breakdown_value=breakdown_value, + ) + + breakdown_query = BREAKDOWN_QUERY_SQL.format( + interval=interval_annotation, num_intervals=num_intervals, inner_sql=inner_sql, ) + self.params.update( + {"seconds_in_interval": seconds_in_interval, "num_intervals": num_intervals,} + ) + + return breakdown_query, self.params, self._parse_trend_result(self.filter, self.entity) - top_elements_array = self._get_top_elements(elements_query, filter, team_id) - params = { - **params, - "values": top_elements_array, - } - breakdown_filter = BREAKDOWN_PROP_JOIN_SQL - breakdown_query = BREAKDOWN_QUERY_SQL - - null_sql = null_sql.format( - interval=interval_annotation, - 
seconds_in_interval=seconds_in_interval, - num_intervals=num_intervals, - date_to=(filter.date_to).strftime("%Y-%m-%d %H:%M:%S"), + def _breakdown_cohort_params(self): + cohort_queries, cohort_ids, cohort_params = format_breakdown_cohort_join_query( + self.team_id, self.filter, entity=self.entity ) - breakdown_filter = breakdown_filter.format(**breakdown_filter_params) - breakdown_query = breakdown_query.format( - null_sql=null_sql, - breakdown_filter=breakdown_filter, - event_join=join_condition, - aggregate_operation=aggregate_operation, - interval_annotation=interval_annotation, + params = {"values": cohort_ids, **cohort_params} + breakdown_filter = BREAKDOWN_COHORT_JOIN_SQL + breakdown_filter_params = {"cohort_queries": cohort_queries} + + return params, breakdown_filter, breakdown_filter_params, "value" + + def _breakdown_prop_params(self, aggregate_operation: str, math_params: Dict): + values_arr = get_breakdown_prop_values( + self.filter, + self.entity, + aggregate_operation, + self.team_id, + extra_params=math_params, + column_optimizer=self.column_optimizer, ) - try: - result = sync_execute(breakdown_query, params) - except: - result = [] + # :TRICKY: We only support string breakdown for event/person properties + assert isinstance(self.filter.breakdown, str) + + if self.filter.breakdown_type == "person": + breakdown_value, _ = get_property_string_expr("person", self.filter.breakdown, "%(key)s", "person_props") + elif self.filter.breakdown_type == "group": + properties_field = f"group_properties_{self.filter.breakdown_group_type_index}" + breakdown_value, _ = get_property_string_expr("groups", self.filter.breakdown, "%(key)s", properties_field) + else: + breakdown_value, _ = get_property_string_expr("events", self.filter.breakdown, "%(key)s", "properties") + + return ( + {"values": values_arr}, + BREAKDOWN_PROP_JOIN_SQL, + {"breakdown_value_expr": breakdown_value}, + breakdown_value, + ) - parsed_results = [] + def _parse_single_aggregate_result( + self, filter: Filter, entity: Entity, additional_values: Dict[str, Any] + ) -> Callable: + def _parse(result: List) -> List: + parsed_results = [] + for idx, stats in enumerate(result): + result_descriptors = self._breakdown_result_descriptors(stats[1], filter, entity) + parsed_result = {"aggregated_value": stats[0], **result_descriptors, **additional_values} + parsed_results.append(parsed_result) - for idx, stats in enumerate(result): + return parsed_results - breakdown_value = stats[2] if not filter.breakdown_type == "cohort" else "" - stripped_value = breakdown_value.strip('"') if isinstance(breakdown_value, str) else breakdown_value + return _parse - extra_label = self._determine_breakdown_label(idx, filter.breakdown_type, filter.breakdown, stripped_value) - label = "{} - {}".format(entity.name, extra_label) - additional_values = { - "label": label, - "breakdown_value": filter.breakdown[idx] - if isinstance(filter.breakdown, list) - else filter.breakdown - if filter.breakdown_type == "cohort" - else stripped_value, - } - parsed_result = parse_response(stats, filter, additional_values) - parsed_results.append(parsed_result) + def _parse_trend_result(self, filter: Filter, entity: Entity) -> Callable: + def _parse(result: List) -> List: + parsed_results = [] + for idx, stats in enumerate(result): + result_descriptors = self._breakdown_result_descriptors(stats[2], filter, entity) + parsed_result = parse_response(stats, filter, result_descriptors) + parsed_results.append(parsed_result) - return parsed_results + return 
sorted(parsed_results, key=lambda x: 0 if x.get("breakdown_value") != "all" else 1) + + return _parse + + def _breakdown_result_descriptors(self, breakdown_value, filter: Filter, entity: Entity): + extra_label = self._determine_breakdown_label( + breakdown_value, filter.breakdown_type, filter.breakdown, breakdown_value + ) + label = "{} - {}".format(entity.name, extra_label) + additional_values = { + "label": label, + } + if filter.breakdown_type == "cohort": + additional_values["breakdown_value"] = "all" if breakdown_value == ALL_USERS_COHORT_ID else breakdown_value + else: + additional_values["breakdown_value"] = breakdown_value + + return additional_values def _determine_breakdown_label( self, - index: int, + breakdown_value: int, breakdown_type: Optional[str], breakdown: Union[str, List[Union[str, int]], None], value: Union[str, int], ) -> str: breakdown = breakdown if breakdown and isinstance(breakdown, list) else [] if breakdown_type == "cohort": - if breakdown[index] == "all": - return "all users" - else: - return Cohort.objects.get(pk=breakdown[index]).name + return get_breakdown_cohort_name(breakdown_value) else: - return str(value) or "" - - def _get_top_elements(self, query: str, filter: Filter, team_id: int) -> List: - element_params = {"key": filter.breakdown, "limit": 20, "team_id": team_id} - - try: - top_elements_array_result = sync_execute(query, element_params) - top_elements_array = top_elements_array_result[0][0] - except: - top_elements_array = [] - - return top_elements_array - - def _format_breakdown_cohort_join_query(self, breakdown: List[Any], team_id: int) -> Tuple[str, List, Dict]: - cohorts = Cohort.objects.filter(team_id=team_id, pk__in=[b for b in breakdown if b != "all"]) - cohort_queries, params = self._parse_breakdown_cohorts(cohorts) - ids = [cohort.pk for cohort in cohorts] - return cohort_queries, ids, params - - def _parse_breakdown_cohorts(self, cohorts: BaseManager) -> Tuple[str, Dict]: - queries = [] - params: Dict[str, Any] = {} - for cohort in cohorts: - person_id_query, cohort_filter_params = format_filter_query(cohort) - params = {**params, **cohort_filter_params} - cohort_query = person_id_query.replace( - "SELECT distinct_id", "SELECT distinct_id, {} as value".format(cohort.pk) + return str(value) or "none" + + def _person_join_condition(self) -> Tuple[str, Dict]: + person_query = ClickhousePersonQuery(self.filter, self.team_id, self.column_optimizer, entity=self.entity) + if person_query.is_used: + query, params = person_query.get_query() + return ( + f""" + {EVENT_JOIN_PERSON_SQL} + INNER JOIN ({query}) person + ON person.id = pdi.person_id + """, + params, ) - queries.append(cohort_query) - return " UNION ALL ".join(queries), params + elif self.entity.math == "dau": + # Only join distinct_ids + return EVENT_JOIN_PERSON_SQL, {} + else: + return "", {} diff --git a/ee/clickhouse/queries/trends/clickhouse_trends.py b/ee/clickhouse/queries/trends/clickhouse_trends.py index 46fa0263db813..1cabbadadbc8b 100644 --- a/ee/clickhouse/queries/trends/clickhouse_trends.py +++ b/ee/clickhouse/queries/trends/clickhouse_trends.py @@ -1,15 +1,83 @@ +from typing import Any, Callable, Dict, List, Tuple + +from django.conf import settings +from django.db.models.query import Prefetch from django.utils import timezone +from sentry_sdk.api import capture_exception +from ee.clickhouse.client import sync_execute from ee.clickhouse.queries.trends.breakdown import ClickhouseTrendsBreakdown -from ee.clickhouse.queries.trends.normal import ClickhouseTrendsNormal -from 
posthog.models.filter import Filter +from ee.clickhouse.queries.trends.formula import ClickhouseTrendsFormula +from ee.clickhouse.queries.trends.lifecycle import ClickhouseLifecycle +from ee.clickhouse.queries.trends.total_volume import ClickhouseTrendsTotalVolume +from posthog.constants import TREND_FILTER_TYPE_ACTIONS, TRENDS_CUMULATIVE, TRENDS_LIFECYCLE +from posthog.models.action import Action +from posthog.models.action_step import ActionStep +from posthog.models.entity import Entity +from posthog.models.filters import Filter +from posthog.models.team import Team +from posthog.queries.base import handle_compare from posthog.queries.trends import Trends from posthog.utils import relative_date_parse -class ClickhouseTrends(ClickhouseTrendsNormal, ClickhouseTrendsBreakdown, Trends): - def _set_default_dates(self, filter: Filter, team_id: int) -> None: +class ClickhouseTrends(ClickhouseTrendsTotalVolume, ClickhouseLifecycle, ClickhouseTrendsFormula, Trends): + def _set_default_dates(self, filter: Filter, team_id: int) -> Filter: + data = {} if not filter._date_from: - filter._date_from = relative_date_parse("-7d") + data.update({"date_from": relative_date_parse("-7d")}) if not filter._date_to: - filter._date_to = timezone.now() + data.update({"date_to": timezone.now()}) + if data: + return Filter(data={**filter._data, **data}) + return filter + + def _get_sql_for_entity(self, filter: Filter, entity: Entity, team_id: int) -> Tuple[str, Dict, Callable]: + if filter.breakdown: + sql, params, parse_function = ClickhouseTrendsBreakdown(entity, filter, team_id).get_query() + elif filter.shown_as == TRENDS_LIFECYCLE: + sql, params, parse_function = self._format_lifecycle_query(entity, filter, team_id) + else: + sql, params, parse_function = self._total_volume_query(entity, filter, team_id) + + return sql, params, parse_function + + def _run_query(self, filter: Filter, entity: Entity, team_id: int) -> List[Dict[str, Any]]: + sql, params, parse_function = self._get_sql_for_entity(filter, entity, team_id) + try: + result = sync_execute(sql, params) + except Exception as e: + capture_exception(e) + if settings.TEST or settings.DEBUG: + raise e + result = [] + + result = parse_function(result) + serialized_data = self._format_serialized(entity, result) + + if filter.display == TRENDS_CUMULATIVE: + serialized_data = self._handle_cumulative(serialized_data) + return serialized_data + + def run(self, filter: Filter, team: Team, *args, **kwargs) -> List[Dict[str, Any]]: + actions = Action.objects.filter(team_id=team.pk).order_by("-id") + if len(filter.actions) > 0: + actions = Action.objects.filter(pk__in=[entity.id for entity in filter.actions], team_id=team.pk) + actions = actions.prefetch_related(Prefetch("steps", queryset=ActionStep.objects.order_by("id"))) + + filter = self._set_default_dates(filter, team.pk) + + if filter.formula: + return handle_compare(filter, self._run_formula_query, team) + + result = [] + for entity in filter.entities: + if entity.type == TREND_FILTER_TYPE_ACTIONS: + try: + entity.name = actions.get(id=entity.id).name + except Action.DoesNotExist: + continue + entities_list = handle_compare(filter, self._run_query, team, entity=entity) + result.extend(entities_list) + + return result diff --git a/ee/clickhouse/queries/trends/formula.py b/ee/clickhouse/queries/trends/formula.py new file mode 100644 index 0000000000000..bc14038801640 --- /dev/null +++ b/ee/clickhouse/queries/trends/formula.py @@ -0,0 +1,92 @@ +import math +from itertools import accumulate +from typing import 
Any, Dict, List + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.queries.breakdown_props import get_breakdown_cohort_name +from ee.clickhouse.queries.trends.util import parse_response +from posthog.constants import TRENDS_CUMULATIVE, TRENDS_DISPLAY_BY_VALUE +from posthog.models.cohort import Cohort +from posthog.models.filters.filter import Filter + + +class ClickhouseTrendsFormula: + def _run_formula_query(self, filter: Filter, team_id: int): + letters = [chr(65 + i) for i in range(0, len(filter.entities))] + queries = [] + params: Dict[str, Any] = {} + for idx, entity in enumerate(filter.entities): + sql, entity_params, _ = self._get_sql_for_entity(filter, entity, team_id) # type: ignore + sql = sql.replace("%(", f"%({idx}_") + entity_params = {f"{idx}_{key}": value for key, value in entity_params.items()} + queries.append(sql) + params = {**params, **entity_params} + + breakdown_value = ( + ", sub_A.breakdown_value" + if filter.breakdown_type == "cohort" + else ", trim(BOTH '\"' FROM sub_A.breakdown_value)" + ) + is_aggregate = filter.display in TRENDS_DISPLAY_BY_VALUE + + sql = """SELECT + {date_select} + arrayMap(({letters_select}) -> {formula}, {selects}) + {breakdown_value} + {max_length} + FROM ({first_query}) as sub_A + {queries} + """.format( + date_select="'' as date," if is_aggregate else "sub_A.date,", + letters_select=", ".join(letters), + formula=filter.formula, # formula is properly escaped in the filter + # Need to wrap aggregates in arrays so we can still use arrayMap + selects=", ".join( + [ + (f"[sub_{letter}.data]" if is_aggregate else f"arrayResize(sub_{letter}.data, max_length, 0)") + for letter in letters + ] + ), + breakdown_value=breakdown_value if filter.breakdown else "", + max_length="" + if is_aggregate + else ", arrayMax([{}]) as max_length".format(", ".join(f"length(sub_{letter}.data)" for letter in letters)), + first_query=queries[0], + queries="".join( + [ + "FULL OUTER JOIN ({query}) as sub_{letter} ON sub_A.breakdown_value = sub_{letter}.breakdown_value ".format( + query=query, letter=letters[i + 1] + ) + for i, query in enumerate(queries[1:]) + ] + ) + if filter.breakdown + else "".join( + [" CROSS JOIN ({}) as sub_{}".format(query, letters[i + 1]) for i, query in enumerate(queries[1:])] + ), + ) + result = sync_execute(sql, params) + response = [] + for item in result: + additional_values: Dict[str, Any] = { + "label": self._label(filter, item, team_id), + } + if is_aggregate: + additional_values["data"] = [] + additional_values["aggregated_value"] = item[1][0] + else: + additional_values["data"] = [ + round(number, 2) if not math.isnan(number) and not math.isinf(number) else 0.0 for number in item[1] + ] + if filter.display == TRENDS_CUMULATIVE: + additional_values["data"] = list(accumulate(additional_values["data"])) + additional_values["count"] = float(sum(additional_values["data"])) + response.append(parse_response(item, filter, additional_values)) + return response + + def _label(self, filter: Filter, item: List, team_id: int) -> str: + if filter.breakdown: + if filter.breakdown_type == "cohort": + return get_breakdown_cohort_name(item[2]) + return item[2] + return "Formula ({})".format(filter.formula) diff --git a/ee/clickhouse/queries/trends/lifecycle.py b/ee/clickhouse/queries/trends/lifecycle.py new file mode 100644 index 0000000000000..c7d386760471d --- /dev/null +++ b/ee/clickhouse/queries/trends/lifecycle.py @@ -0,0 +1,179 @@ +from datetime import datetime, timedelta +from typing import Any, Callable, Dict, List, Tuple, 
Union + +from dateutil.relativedelta import relativedelta +from django.db.models.query import Prefetch +from rest_framework.exceptions import ValidationError +from rest_framework.request import Request + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.models.action import format_action_filter +from ee.clickhouse.models.person import get_persons_by_uuids +from ee.clickhouse.models.property import parse_prop_clauses +from ee.clickhouse.queries.trends.util import parse_response +from ee.clickhouse.queries.util import get_earliest_timestamp, get_time_diff, get_trunc_func_ch, parse_timestamps +from ee.clickhouse.sql.person import GET_TEAM_PERSON_DISTINCT_IDS +from ee.clickhouse.sql.trends.lifecycle import LIFECYCLE_PEOPLE_SQL, LIFECYCLE_SQL +from posthog.constants import TREND_FILTER_TYPE_ACTIONS +from posthog.models.entity import Entity +from posthog.models.filters import Filter +from posthog.queries.lifecycle import LifecycleTrend + + +class ClickhouseLifecycle(LifecycleTrend): + def get_interval(self, interval: str) -> Tuple[Union[timedelta, relativedelta], str, str]: + if interval == "hour": + return timedelta(hours=1), "1 HOUR", "1 MINUTE" + elif interval == "minute": + return timedelta(minutes=1), "1 MINUTE", "1 SECOND" + elif interval == "day": + return timedelta(days=1), "1 DAY", "1 HOUR" + elif interval == "week": + return timedelta(weeks=1), "1 WEEK", "1 DAY" + elif interval == "month": + return relativedelta(months=1), "1 MONTH", "1 DAY" + else: + raise ValidationError("{interval} not supported") + + def _format_lifecycle_query(self, entity: Entity, filter: Filter, team_id: int) -> Tuple[str, Dict, Callable]: + date_from = filter.date_from + + if not date_from: + date_from = get_earliest_timestamp(team_id) + + interval = filter.interval + num_intervals, seconds_in_interval, _ = get_time_diff(interval, filter.date_from, filter.date_to, team_id) + interval_increment, interval_string, sub_interval_string = self.get_interval(interval) + trunc_func = get_trunc_func_ch(interval) + event_query = "" + event_params: Dict[str, Any] = {} + + props_to_filter = [*filter.properties, *entity.properties] + prop_filters, prop_filter_params = parse_prop_clauses(props_to_filter, team_id) + + _, _, date_params = parse_timestamps(filter=filter, team_id=team_id) + + if entity.type == TREND_FILTER_TYPE_ACTIONS: + try: + action = entity.get_action() + event_query, event_params = format_action_filter(action) + except: + return "", {}, self._parse_result(filter, entity) + else: + event_query = "event = %(event)s" + event_params = {"event": entity.id} + + return ( + LIFECYCLE_SQL.format( + interval=interval_string, + trunc_func=trunc_func, + event_query=event_query, + filters=prop_filters, + sub_interval=sub_interval_string, + GET_TEAM_PERSON_DISTINCT_IDS=GET_TEAM_PERSON_DISTINCT_IDS, + ), + { + "team_id": team_id, + "prev_date_from": (date_from - interval_increment).strftime( + "%Y-%m-%d{}".format( + " %H:%M:%S" if filter.interval == "hour" or filter.interval == "minute" else " 00:00:00" + ) + ), + "num_intervals": num_intervals, + "seconds_in_interval": seconds_in_interval, + **event_params, + **date_params, + **prop_filter_params, + }, + self._parse_result(filter, entity), + ) + + def _parse_result(self, filter: Filter, entity: Entity) -> Callable: + def _parse(result: List) -> List: + res = [] + for val in result: + label = "{} - {}".format(entity.name, val[2]) + additional_values = {"label": label, "status": val[2]} + parsed_result = parse_response(val, filter, additional_values) + 
res.append(parsed_result) + + return res + + return _parse + + def get_people( + self, + filter: Filter, + team_id: int, + target_date: datetime, + lifecycle_type: str, + request: Request, + limit: int = 100, + ): + entity = filter.entities[0] + date_from = filter.date_from + + if not date_from: + date_from = get_earliest_timestamp(team_id) + + interval = filter.interval + num_intervals, seconds_in_interval, _ = get_time_diff( + interval, filter.date_from, filter.date_to, team_id=team_id + ) + interval_increment, interval_string, sub_interval_string = self.get_interval(interval) + trunc_func = get_trunc_func_ch(interval) + event_query = "" + event_params: Dict[str, Any] = {} + + _, _, date_params = parse_timestamps(filter=filter, team_id=team_id) + + if entity.type == TREND_FILTER_TYPE_ACTIONS: + try: + action = entity.get_action() + event_query, event_params = format_action_filter(action) + except: + return [] + else: + event_query = "event = %(event)s" + event_params = {"event": entity.id} + + props_to_filter = [*filter.properties, *entity.properties] + prop_filters, prop_filter_params = parse_prop_clauses(props_to_filter, team_id) + + result = sync_execute( + LIFECYCLE_PEOPLE_SQL.format( + interval=interval_string, + trunc_func=trunc_func, + event_query=event_query, + filters=prop_filters, + sub_interval=sub_interval_string, + GET_TEAM_PERSON_DISTINCT_IDS=GET_TEAM_PERSON_DISTINCT_IDS, + ), + { + "team_id": team_id, + "prev_date_from": (date_from - interval_increment).strftime( + "%Y-%m-%d{}".format( + " %H:%M:%S" if filter.interval == "hour" or filter.interval == "minute" else " 00:00:00" + ) + ), + "num_intervals": num_intervals, + "seconds_in_interval": seconds_in_interval, + **event_params, + **date_params, + **prop_filter_params, + "status": lifecycle_type, + "target_date": target_date.strftime( + "%Y-%m-%d{}".format( + " %H:%M:%S" if filter.interval == "hour" or filter.interval == "minute" else " 00:00:00" + ) + ), + "offset": filter.offset, + "limit": limit, + }, + ) + people = get_persons_by_uuids(team_id=team_id, uuids=[p[0] for p in result]) + people = people.prefetch_related(Prefetch("persondistinctid_set", to_attr="distinct_ids_cache")) + + from posthog.api.person import PersonSerializer + + return PersonSerializer(people, many=True).data diff --git a/ee/clickhouse/queries/trends/normal.py b/ee/clickhouse/queries/trends/normal.py deleted file mode 100644 index a1c1a25864f34..0000000000000 --- a/ee/clickhouse/queries/trends/normal.py +++ /dev/null @@ -1,76 +0,0 @@ -from typing import Any, Dict, List - -from django.utils import timezone - -from ee.clickhouse.client import sync_execute -from ee.clickhouse.models.action import format_action_filter -from ee.clickhouse.models.property import parse_prop_clauses -from ee.clickhouse.queries.trends.util import parse_response, process_math -from ee.clickhouse.queries.util import get_interval_annotation_ch, get_time_diff, parse_timestamps -from ee.clickhouse.sql.events import NULL_SQL -from ee.clickhouse.sql.trends.aggregate import AGGREGATE_SQL -from ee.clickhouse.sql.trends.volume import VOLUME_ACTIONS_SQL, VOLUME_SQL -from posthog.constants import TREND_FILTER_TYPE_ACTIONS -from posthog.models.action import Action -from posthog.models.entity import Entity -from posthog.models.filter import Filter - - -class ClickhouseTrendsNormal: - def _format_normal_query(self, entity: Entity, filter: Filter, team_id: int) -> List[Dict[str, Any]]: - - interval_annotation = get_interval_annotation_ch(filter.interval) - num_intervals, 
seconds_in_interval = get_time_diff(filter.interval or "day", filter.date_from, filter.date_to) - parsed_date_from, parsed_date_to = parse_timestamps(filter=filter) - - props_to_filter = [*filter.properties, *entity.properties] - prop_filters, prop_filter_params = parse_prop_clauses(props_to_filter, team_id) - - aggregate_operation, join_condition, math_params = process_math(entity) - - params: Dict = {"team_id": team_id} - params = {**params, **prop_filter_params, **math_params} - content_sql_params = { - "interval": interval_annotation, - "timestamp": "timestamp", - "team_id": team_id, - "parsed_date_from": parsed_date_from, - "parsed_date_to": parsed_date_to, - "filters": prop_filters, - "event_join": join_condition, - "aggregate_operation": aggregate_operation, - } - - if entity.type == TREND_FILTER_TYPE_ACTIONS: - try: - action = Action.objects.get(pk=entity.id) - action_query, action_params = format_action_filter(action) - params = {**params, **action_params} - content_sql = VOLUME_ACTIONS_SQL - content_sql_params = {**content_sql_params, "actions_query": action_query} - except: - return [] - else: - content_sql = VOLUME_SQL - params = {**params, "event": entity.id} - null_sql = NULL_SQL.format( - interval=interval_annotation, - seconds_in_interval=seconds_in_interval, - num_intervals=num_intervals, - date_to=filter.date_to.strftime("%Y-%m-%d %H:%M:%S"), - ) - content_sql = content_sql.format(**content_sql_params) - final_query = AGGREGATE_SQL.format(null_sql=null_sql, content_sql=content_sql) - - try: - result = sync_execute(final_query, params) - - except: - result = [] - - parsed_results = [] - for _, stats in enumerate(result): - parsed_result = parse_response(stats, filter) - parsed_results.append(parsed_result) - - return parsed_results diff --git a/ee/clickhouse/queries/trends/person.py b/ee/clickhouse/queries/trends/person.py new file mode 100644 index 0000000000000..1cd900a90c897 --- /dev/null +++ b/ee/clickhouse/queries/trends/person.py @@ -0,0 +1,83 @@ +from datetime import timedelta +from typing import Dict, Tuple + +from dateutil.relativedelta import relativedelta +from django.utils import timezone +from rest_framework.utils.serializer_helpers import ReturnDict + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.models.person import ClickhousePersonSerializer +from ee.clickhouse.queries.trends.trend_event_query import TrendsEventQuery +from ee.clickhouse.sql.person import GET_PERSONS_FROM_EVENT_QUERY +from posthog.constants import TRENDS_CUMULATIVE, TRENDS_DISPLAY_BY_VALUE +from posthog.models.cohort import Cohort +from posthog.models.entity import Entity +from posthog.models.filters import Filter +from posthog.models.property import Property +from posthog.models.team import Team + + +def _handle_date_interval(filter: Filter) -> Filter: + # adhoc date handling. 
parsed differently with django orm + date_from = filter.date_from or timezone.now() + data: Dict = {} + if filter.interval == "month": + data.update( + {"date_to": (date_from + relativedelta(months=1) - timedelta(days=1)).strftime("%Y-%m-%d %H:%M:%S")} + ) + elif filter.interval == "week": + data.update({"date_to": (date_from + relativedelta(weeks=1) - timedelta(days=1)).strftime("%Y-%m-%d %H:%M:%S")}) + elif filter.interval == "day": + data.update({"date_to": date_from}) + elif filter.interval == "hour": + data.update({"date_to": date_from + timedelta(hours=1)}) + elif filter.interval == "minute": + data.update({"date_to": date_from + timedelta(minutes=1)}) + return filter.with_data(data) + + +class TrendsPersonQuery: + def __init__(self, team: Team, entity: Entity, filter: Filter): + self.team = team + self.entity = entity + self.filter = filter + + if self.filter.display != TRENDS_CUMULATIVE and not self.filter.display in TRENDS_DISPLAY_BY_VALUE: + self.filter = _handle_date_interval(self.filter) + + def get_query(self) -> Tuple[str, Dict]: + if self.filter.breakdown_type == "cohort" and self.filter.breakdown_value != "all": + cohort = Cohort.objects.get(pk=self.filter.breakdown_value, team_id=self.team.pk) + self.filter = self.filter.with_data( + {"properties": self.filter.properties + [Property(key="id", value=cohort.pk, type="cohort")]} + ) + elif ( + self.filter.breakdown_type + and isinstance(self.filter.breakdown, str) + and isinstance(self.filter.breakdown_value, str) + ): + breakdown_prop = Property( + key=self.filter.breakdown, value=self.filter.breakdown_value, type=self.filter.breakdown_type + ) + self.filter = self.filter.with_data({"properties": self.filter.properties + [breakdown_prop]}) + + events_query, params = TrendsEventQuery( + filter=self.filter, + team_id=self.team.pk, + entity=self.entity, + should_join_distinct_ids=True, + should_join_persons=True, + extra_fields=["distinct_id", "team_id"], + extra_person_fields=["created_at", "person_props", "is_identified"], + ).get_query() + + return ( + GET_PERSONS_FROM_EVENT_QUERY.format(events_query=events_query), + {**params, "offset": self.filter.offset, "limit": 200}, + ) + + def get_people(self) -> ReturnDict: + query, params = self.get_query() + people = sync_execute(query, params) + + return ClickhousePersonSerializer(people, many=True).data diff --git a/ee/clickhouse/queries/trends/test/test_formula.py b/ee/clickhouse/queries/trends/test/test_formula.py new file mode 100644 index 0000000000000..b6c401debbb16 --- /dev/null +++ b/ee/clickhouse/queries/trends/test/test_formula.py @@ -0,0 +1,329 @@ +from typing import Dict, Optional +from uuid import uuid4 + +from freezegun.api import freeze_time + +from ee.clickhouse.models.event import create_event +from ee.clickhouse.queries.trends.clickhouse_trends import ClickhouseTrends +from posthog.constants import TRENDS_CUMULATIVE, TRENDS_PIE +from posthog.models import Cohort, Person +from posthog.models.filters.filter import Filter +from posthog.queries.abstract_test.test_interval import AbstractIntervalTest +from posthog.test.base import APIBaseTest + + +def _create_event(**kwargs): + kwargs.update({"event_uuid": uuid4()}) + create_event(**kwargs) + + +class TestFormula(AbstractIntervalTest, APIBaseTest): + CLASS_DATA_LEVEL_SETUP = False + + def setUp(self): + super().setUp() # type: ignore + + Person.objects.create( + team_id=self.team.pk, distinct_ids=["blabla", "anonymous_id"], properties={"$some_prop": "some_val"} + ) + with freeze_time("2020-01-02T13:01:01Z"): + 
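# Fixture summary (from the events created below): 2020-01-02 has three "session start" events (durations 200 and 300 in Paris, 400 in London); 2020-01-03 adds two more in London (400 and 500) plus one "session end". +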
_create_event( + team=self.team, + event="session start", + distinct_id="blabla", + properties={"session duration": 200, "location": "Paris", "$current_url": "http://example.org"}, + ) + _create_event( + team=self.team, + event="session start", + distinct_id="blabla", + properties={"session duration": 300, "location": "Paris"}, + ) + _create_event( + team=self.team, + event="session start", + distinct_id="blabla", + properties={"session duration": 400, "location": "London"}, + ) + with freeze_time("2020-01-03T13:01:01Z"): + _create_event( + team=self.team, + event="session start", + distinct_id="blabla", + properties={"session duration": 400, "location": "London"}, + ) + with freeze_time("2020-01-03T13:04:01Z"): + _create_event( + team=self.team, + event="session start", + distinct_id="blabla", + properties={"session duration": 500, "location": "London"}, + ) + _create_event( + team=self.team, + event="session end", + distinct_id="blabla", + properties={"session duration": 500, "location": "London"}, + ) + + def _run(self, extra: Dict = {}, run_at: Optional[str] = None): + with freeze_time(run_at or "2020-01-04T13:01:01Z"): + action_response = ClickhouseTrends().run( + Filter( + data={ + "events": [ + {"id": "session start", "math": "sum", "math_property": "session duration"}, + {"id": "session start", "math": "avg", "math_property": "session duration"}, + ], + "formula": "A + B", + **extra, + } + ), + self.team, + ) + return action_response + + def test_minute_interval(self): + data = self._run({"date_from": "-1h", "interval": "minute"}, run_at="2020-01-03T13:05:01Z")[0]["data"] + self.assertEqual(data[-2], 1000.0) + self.assertEqual(data[-5], 800.0) + + def test_hour_interval(self): + data = self._run({"date_from": "-1d", "interval": "hour"}, run_at="2020-01-03T13:05:01Z")[0]["data"] + self.assertEqual( + data, + [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1200.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1350.0, + ], + ) + + def test_day_interval(self): + data = self._run({"date_from": "-3d"}, run_at="2020-01-03T13:05:01Z")[0]["data"] + self.assertEqual(data, [0.0, 0.0, 1200.0, 1350.0]) + + def test_week_interval(self): + data = self._run({"date_from": "-2w", "interval": "week"}, run_at="2020-01-03T13:05:01Z")[0]["data"] + self.assertEqual(data, [0.0, 0.0, 2160.0]) + + def test_month_interval(self): + data = self._run({"date_from": "-2m", "interval": "month"}, run_at="2020-01-03T13:05:01Z")[0]["data"] + self.assertEqual(data, [0.0, 0.0, 2160.0]) + + def test_interval_rounding(self): + pass + + def test_formula(self): + self.assertEqual(self._run({"formula": "A - B"})[0]["data"], [0.0, 0.0, 0.0, 0.0, 0.0, 600.0, 450.0, 0.0]) + self.assertEqual(self._run({"formula": "A * B"})[0]["data"], [0.0, 0.0, 0.0, 0.0, 0.0, 270000.0, 405000.0, 0.0]) + self.assertEqual(self._run({"formula": "A / B"})[0]["data"], [0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 2.0, 0.0]) + self.assertEqual(self._run({"formula": "(A/3600)/B"})[0]["data"], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) + self.assertEqual(self._run({"formula": "(A/3600)/B"})[0]["count"], 0) + + self.assertEqual(self._run({"formula": "A/0"})[0]["data"], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) + self.assertEqual(self._run({"formula": "A/0"})[0]["count"], 0) + + def test_breakdown(self): + response = self._run({"formula": "A - B", "breakdown": "location"}) + 
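# A is sum(session duration) and B is avg(session duration): London on 2020-01-03 has sessions of 400 and 500, so A - B = 900 - 450 = 450; Paris on 2020-01-02 has 200 and 300, so 500 - 250 = 250. +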
self.assertEqual(response[0]["data"], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 450.0, 0.0]) + self.assertEqual(response[0]["label"], "London") + self.assertEqual(response[1]["data"], [0.0, 0.0, 0.0, 0.0, 0.0, 250.0, 0.0, 0.0]) + self.assertEqual(response[1]["label"], "Paris") + + def test_breakdown_counts_of_different_events_one_without_events(self): + with freeze_time("2020-01-04T13:01:01Z"): + response = ClickhouseTrends().run( + Filter( + data={ + "insight": "TRENDS", + "display": "ActionsLineGraph", + "formula": "B / A", + "breakdown": "location", + "breakdown_type": "event", + "events": [ + {"id": "session start", "name": "session start", "type": "events", "order": 0}, + {"id": "session error", "name": "session error", "type": "events", "order": 1}, + ], + } + ), + self.team, + ) + self.assertEqual( + response, + [ + { + "data": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + "count": 0.0, + "labels": [ + "28-Dec-2019", + "29-Dec-2019", + "30-Dec-2019", + "31-Dec-2019", + "1-Jan-2020", + "2-Jan-2020", + "3-Jan-2020", + "4-Jan-2020", + ], + "days": [ + "2019-12-28", + "2019-12-29", + "2019-12-30", + "2019-12-31", + "2020-01-01", + "2020-01-02", + "2020-01-03", + "2020-01-04", + ], + "label": "London", + }, + { + "data": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + "count": 0.0, + "labels": [ + "28-Dec-2019", + "29-Dec-2019", + "30-Dec-2019", + "31-Dec-2019", + "1-Jan-2020", + "2-Jan-2020", + "3-Jan-2020", + "4-Jan-2020", + ], + "days": [ + "2019-12-28", + "2019-12-29", + "2019-12-30", + "2019-12-31", + "2020-01-01", + "2020-01-02", + "2020-01-03", + "2020-01-04", + ], + "label": "Paris", + }, + ], + ) + + def test_breakdown_cohort(self): + cohort = Cohort.objects.create( + team=self.team, name="cohort1", groups=[{"properties": {"$some_prop": "some_val"}}] + ) + response = self._run({"breakdown": ["all", cohort.pk], "breakdown_type": "cohort"}) + self.assertEqual(response[0]["data"], [0.0, 0.0, 0.0, 0.0, 0.0, 1200.0, 1350.0, 0.0]) + self.assertEqual(response[0]["label"], "all users") + self.assertEqual(response[1]["data"], [0.0, 0.0, 0.0, 0.0, 0.0, 1200.0, 1350.0, 0.0]) + self.assertEqual(response[1]["label"], "cohort1") + + def test_breakdown_mismatching_sizes(self): + response = self._run( + {"events": [{"id": "session start"}, {"id": "session end"},], "breakdown": "location", "formula": "A + B",} + ) + + self.assertEqual(response[0]["label"], "London") + self.assertEqual(response[0]["data"], [0, 0, 0, 0, 0, 1, 3, 0]) + self.assertEqual(response[1]["label"], "Paris") + self.assertEqual(response[1]["data"], [0, 0, 0, 0, 0, 2, 0, 0]) + + def test_global_properties(self): + self.assertEqual( + self._run({"properties": [{"key": "$current_url", "value": "http://example.org"}]})[0]["data"], + [0.0, 0.0, 0.0, 0.0, 0.0, 400.0, 0.0, 0.0], + ) + + def test_properties_with_escape_params(self): + # regression test + self.assertEqual( + self._run( + { + "properties": [ + { + "key": "$current_url", + "value": "http://localhost:8000/insights?insight=TRENDS&interval=day&display=ActionsLineGraph&actions=%5B%5D&events=%5B%7B%22id%22%3A%22%24pageview%22%2C%22name%22%3A%22%24pageview%22%2C%22type%22%3A%22events%22%2C%22order%22%3A0%7D%2C%7B%22id%22%3A%22%24pageview%2", + } + ] + } + )[0]["data"], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + ) + + def test_event_properties(self): + self.assertEqual( + self._run( + { + "events": [ + { + "id": "session start", + "math": "sum", + "math_property": "session duration", + "properties": [{"key": "$current_url", "value": "http://example.org"}], + }, + {"id": "session start", 
"math": "avg", "math_property": "session duration"}, + ] + } + )[0]["data"], + [0.0, 0.0, 0.0, 0.0, 0.0, 500.0, 450.0, 0.0], + ) + + def test_compare(self): + response = self._run({"date_from": "-1dStart", "compare": True}) + self.assertEqual(response[0]["data"], [1350.0, 0.0]) + self.assertEqual(response[1]["data"], [0, 1200.0, 1350.0]) + + def test_pie(self): + self.assertEqual(self._run({"display": TRENDS_PIE})[0]["aggregated_value"], 2160.0) + + def test_cumulative(self): + self.assertEqual( + self._run({"display": TRENDS_CUMULATIVE})[0]["data"], [0.0, 0.0, 0.0, 0.0, 0.0, 1200.0, 2550.0, 2550.0] + ) + + def test_multiple_events(self): + # regression test + self.assertEqual( + self._run( + { + "events": [ + {"id": "session start", "math": "sum", "math_property": "session duration"}, + {"id": "session start", "math": "avg", "math_property": "session duration"}, + {"id": "session start", "math": "avg", "math_property": "session duration"}, + ] + } + )[0]["data"], + [0.0, 0.0, 0.0, 0.0, 0.0, 1200.0, 1350.0, 0.0], + ) diff --git a/ee/clickhouse/queries/trends/total_volume.py b/ee/clickhouse/queries/trends/total_volume.py new file mode 100644 index 0000000000000..515a756258909 --- /dev/null +++ b/ee/clickhouse/queries/trends/total_volume.py @@ -0,0 +1,83 @@ +from typing import Any, Callable, Dict, List, Tuple + +from ee.clickhouse.queries.trends.trend_event_query import TrendsEventQuery +from ee.clickhouse.queries.trends.util import enumerate_time_range, parse_response, process_math +from ee.clickhouse.queries.util import ( + format_ch_timestamp, + get_earliest_timestamp, + get_interval_func_ch, + get_time_diff, + get_trunc_func_ch, +) +from ee.clickhouse.sql.events import NULL_SQL +from ee.clickhouse.sql.trends.aggregate import AGGREGATE_SQL +from ee.clickhouse.sql.trends.volume import ACTIVE_USER_SQL, VOLUME_SQL, VOLUME_TOTAL_AGGREGATE_SQL +from posthog.constants import MONTHLY_ACTIVE, TRENDS_DISPLAY_BY_VALUE, WEEKLY_ACTIVE +from posthog.models.entity import Entity +from posthog.models.filters import Filter + + +class ClickhouseTrendsTotalVolume: + def _total_volume_query(self, entity: Entity, filter: Filter, team_id: int) -> Tuple[str, Dict, Callable]: + trunc_func = get_trunc_func_ch(filter.interval) + interval_func = get_interval_func_ch(filter.interval) + _, seconds_in_interval, _ = get_time_diff(filter.interval, filter.date_from, filter.date_to, team_id=team_id) + aggregate_operation, join_condition, math_params = process_math(entity) + + trend_event_query = TrendsEventQuery( + filter=filter, + entity=entity, + team_id=team_id, + should_join_distinct_ids=True + if join_condition != "" or entity.math in [WEEKLY_ACTIVE, MONTHLY_ACTIVE] + else False, + ) + event_query, event_query_params = trend_event_query.get_query() + + content_sql_params = { + "aggregate_operation": aggregate_operation, + "timestamp": "e.timestamp", + "interval": trunc_func, + } + params: Dict = {"team_id": team_id} + params = {**params, **math_params, **event_query_params} + + if filter.display in TRENDS_DISPLAY_BY_VALUE: + content_sql = VOLUME_TOTAL_AGGREGATE_SQL.format(event_query=event_query, **content_sql_params) + time_range = enumerate_time_range(filter, seconds_in_interval) + + return ( + content_sql, + params, + lambda result: [ + {"aggregated_value": result[0][0] if result and len(result) else 0, "days": time_range} + ], + ) + else: + + if entity.math in [WEEKLY_ACTIVE, MONTHLY_ACTIVE]: + content_sql = ACTIVE_USER_SQL.format( + event_query=event_query, + **content_sql_params, + 
parsed_date_to=trend_event_query.parsed_date_to, + parsed_date_from=trend_event_query.parsed_date_from, + **trend_event_query.active_user_params + ) + else: + content_sql = VOLUME_SQL.format(event_query=event_query, **content_sql_params) + + null_sql = NULL_SQL.format(trunc_func=trunc_func, interval_func=interval_func) + params["interval"] = filter.interval + final_query = AGGREGATE_SQL.format(null_sql=null_sql, content_sql=content_sql) + return final_query, params, self._parse_total_volume_result(filter) + + def _parse_total_volume_result(self, filter: Filter) -> Callable: + def _parse(result: List) -> List: + parsed_results = [] + for _, stats in enumerate(result): + parsed_result = parse_response(stats, filter) + parsed_results.append(parsed_result) + + return parsed_results + + return _parse diff --git a/ee/clickhouse/queries/trends/trend_event_query.py b/ee/clickhouse/queries/trends/trend_event_query.py new file mode 100644 index 0000000000000..a4e2c1f61bd52 --- /dev/null +++ b/ee/clickhouse/queries/trends/trend_event_query.py @@ -0,0 +1,115 @@ +from typing import Any, Dict, Tuple + +from ee.clickhouse.models.entity import get_entity_filtering_params +from ee.clickhouse.queries.event_query import ClickhouseEventQuery +from ee.clickhouse.queries.person_query import ClickhousePersonQuery +from ee.clickhouse.queries.trends.util import get_active_user_params +from ee.clickhouse.queries.util import date_from_clause, get_time_diff, get_trunc_func_ch, parse_timestamps +from posthog.constants import MONTHLY_ACTIVE, WEEKLY_ACTIVE +from posthog.models import Entity +from posthog.models.filters.filter import Filter + + +class TrendsEventQuery(ClickhouseEventQuery): + _entity: Entity + _filter: Filter + + def __init__(self, entity: Entity, *args, **kwargs): + self._entity = entity + super().__init__(*args, **kwargs) + self._person_query = ClickhousePersonQuery( + self._filter, + self._team_id, + self._column_optimizer, + extra_fields=kwargs.get("extra_person_fields", []), + entity=entity, + ) + + def get_query(self) -> Tuple[str, Dict[str, Any]]: + _fields = ( + f"{self.EVENT_TABLE_ALIAS}.timestamp as timestamp" + + ( + " ".join( + f", {self.EVENT_TABLE_ALIAS}.{column_name} as {column_name}" + for column_name in self._column_optimizer.event_columns_to_query + ) + ) + + (f", {self.DISTINCT_ID_TABLE_ALIAS}.person_id as person_id" if self._should_join_distinct_ids else "") + + ( + " ".join( + f", {self.EVENT_TABLE_ALIAS}.{column_name} as {column_name}" for column_name in self._extra_fields + ) + ) + + ( + " ".join( + f", {self.PERSON_TABLE_ALIAS}.{column_name} as {column_name}" + for column_name in self._extra_person_fields + ) + ) + ) + + date_query, date_params = self._get_date_filter() + self.params.update(date_params) + + prop_filters = [*self._filter.properties, *self._entity.properties] + prop_query, prop_params = self._get_props(prop_filters) + self.params.update(prop_params) + + entity_query, entity_params = self._get_entity_query() + self.params.update(entity_params) + + person_query, person_params = self._get_person_query() + self.params.update(person_params) + + groups_query, groups_params = self._get_groups_query() + self.params.update(groups_params) + + query = f""" + SELECT {_fields} FROM events {self.EVENT_TABLE_ALIAS} + {self._get_disintct_id_query()} + {person_query} + {groups_query} + WHERE team_id = %(team_id)s + {entity_query} + {date_query} + {prop_query} + """ + + return query, self.params + + def _determine_should_join_distinct_ids(self) -> None: + if self._entity.math == 
"dau": + self._should_join_distinct_ids = True + + def _get_date_filter(self) -> Tuple[str, Dict]: + date_filter = "" + date_params: Dict[str, Any] = {} + interval_annotation = get_trunc_func_ch(self._filter.interval) + _, _, round_interval = get_time_diff( + self._filter.interval, self._filter.date_from, self._filter.date_to, team_id=self._team_id + ) + _, parsed_date_to, date_params = parse_timestamps(filter=self._filter, team_id=self._team_id) + parsed_date_from = date_from_clause(interval_annotation, round_interval) + + self.parsed_date_from = parsed_date_from + self.parsed_date_to = parsed_date_to + + if self._entity.math in [WEEKLY_ACTIVE, MONTHLY_ACTIVE]: + date_filter = "{parsed_date_from_prev_range} {parsed_date_to}" + format_params = get_active_user_params(self._filter, self._entity, self._team_id) + self.active_user_params = format_params + + date_filter = date_filter.format(**format_params, parsed_date_to=parsed_date_to) + else: + date_filter = "{parsed_date_from} {parsed_date_to}".format( + parsed_date_from=parsed_date_from, parsed_date_to=parsed_date_to + ) + + return date_filter, date_params + + def _get_entity_query(self) -> Tuple[str, Dict]: + entity_params, entity_format_params = get_entity_filtering_params( + self._entity, self._team_id, table_name=self.EVENT_TABLE_ALIAS + ) + + return entity_format_params["entity_query"], entity_params diff --git a/ee/clickhouse/queries/trends/util.py b/ee/clickhouse/queries/trends/util.py index 81bd0e037a234..94ab208a43d6c 100644 --- a/ee/clickhouse/queries/trends/util.py +++ b/ee/clickhouse/queries/trends/util.py @@ -1,31 +1,48 @@ from datetime import timedelta -from typing import Any, Dict, Optional, Tuple +from typing import Any, Dict, List, Tuple, Union +from rest_framework.exceptions import ValidationError + +from ee.clickhouse.models.property import get_property_string_expr +from ee.clickhouse.queries.util import format_ch_timestamp, get_earliest_timestamp from ee.clickhouse.sql.events import EVENT_JOIN_PERSON_SQL +from posthog.constants import WEEKLY_ACTIVE from posthog.models.entity import Entity -from posthog.models.filter import Filter +from posthog.models.filters import Filter, PathFilter +from posthog.models.filters.mixins.groups import validate_group_type_index + +MATH_FUNCTIONS = { + "sum": "sum", + "avg": "avg", + "min": "min", + "max": "max", + "median": "quantile(0.50)", + "p90": "quantile(0.90)", + "p95": "quantile(0.95)", + "p99": "quantile(0.99)", +} -def process_math(entity: Entity) -> Tuple[str, str, Dict[str, Optional[str]]]: +def process_math(entity: Entity) -> Tuple[str, str, Dict[str, Any]]: aggregate_operation = "count(*)" - params = {} join_condition = "" - value = "toFloat64OrNull(JSONExtractRaw(properties, '{}'))".format(entity.math_property) + params: Dict[str, Any] = {} if entity.math == "dau": join_condition = EVENT_JOIN_PERSON_SQL aggregate_operation = "count(DISTINCT person_id)" - elif entity.math == "sum": - aggregate_operation = "sum({})".format(value) - params = {"join_property_key": entity.math_property} - elif entity.math == "avg": - aggregate_operation = "avg({})".format(value) - params = {"join_property_key": entity.math_property} - elif entity.math == "min": - aggregate_operation = "min({})".format(value) - params = {"join_property_key": entity.math_property} - elif entity.math == "max": - aggregate_operation = "max({})".format(value) - params = {"join_property_key": entity.math_property} + elif entity.math == "unique_group": + validate_group_type_index("math_group_type_index", 
entity.math_group_type_index, required=True) + + aggregate_operation = f"count(DISTINCT $group_{entity.math_group_type_index})" + elif entity.math in MATH_FUNCTIONS: + if entity.math_property is None: + raise ValidationError({"math_property": "This field is required when `math` is set."}, code="required") + + key = f"e_{entity.index}_math_prop" + value, _ = get_property_string_expr("events", entity.math_property, f"%({key})s", "properties") + aggregate_operation = f"{MATH_FUNCTIONS[entity.math]}(toFloat64OrNull({value}))" + params["join_property_key"] = entity.math_property + params[key] = entity.math_property return aggregate_operation, join_condition, params @@ -33,28 +50,71 @@ def process_math(entity: Entity) -> Tuple[str, str, Dict[str, Optional[str]]]: def parse_response(stats: Dict, filter: Filter, additional_values: Dict = {}) -> Dict[str, Any]: counts = stats[1] dates = [ - ((item - timedelta(days=1)) if filter.interval == "month" else item).strftime( + item.strftime( "%Y-%m-%d{}".format(", %H:%M" if filter.interval == "hour" or filter.interval == "minute" else "") ) for item in stats[0] ] labels = [ - ((item - timedelta(days=1)) if filter.interval == "month" else item).strftime( - "%a. %-d %B{}".format(", %H:%M" if filter.interval == "hour" or filter.interval == "minute" else "") + item.strftime( + "%-d-%b-%Y{}".format(" %H:%M" if filter.interval == "hour" or filter.interval == "minute" else "") ) for item in stats[0] ] days = [ - ((item - timedelta(days=1)) if filter.interval == "month" else item).strftime( + item.strftime( "%Y-%m-%d{}".format(" %H:%M:%S" if filter.interval == "hour" or filter.interval == "minute" else "") ) for item in stats[0] ] return { - "data": counts, - "count": sum(counts), - "dates": dates, + "data": [float(c) for c in counts], + "count": float(sum(counts)), "labels": labels, "days": days, **additional_values, } + + +def get_active_user_params(filter: Union[Filter, PathFilter], entity: Entity, team_id: int) -> Dict[str, Any]: + params = {} + params.update({"prev_interval": "7 DAY" if entity.math == WEEKLY_ACTIVE else "30 day"}) + diff = timedelta(days=7) if entity.math == WEEKLY_ACTIVE else timedelta(days=30) + if filter.date_from: + params.update( + { + "parsed_date_from_prev_range": f"AND timestamp >= '{format_ch_timestamp(filter.date_from - diff, filter)}'" + } + ) + else: + try: + earliest_date = get_earliest_timestamp(team_id) + except IndexError: + raise ValidationError("Active User queries require a lower date bound") + else: + params.update( + { + "parsed_date_from_prev_range": f"AND timestamp >= '{format_ch_timestamp(earliest_date - diff, filter)}'" + } + ) + + return params + + +def enumerate_time_range(filter: Filter, seconds_in_interval: int) -> List[str]: + date_from = filter.date_from + date_to = filter.date_to + delta = timedelta(seconds=seconds_in_interval) + time_range: List[str] = [] + + if not date_from or not date_to: + return time_range + + while date_from <= date_to: + time_range.append( + date_from.strftime( + "%Y-%m-%d{}".format(" %H:%M:%S" if filter.interval == "hour" or filter.interval == "minute" else "") + ) + ) + date_from += delta + return time_range diff --git a/ee/clickhouse/queries/util.py b/ee/clickhouse/queries/util.py index c9fe17ad46945..46863b1c6531f 100644 --- a/ee/clickhouse/queries/util.py +++ b/ee/clickhouse/queries/util.py @@ -1,80 +1,128 @@ -from datetime import datetime, timedelta -from typing import Any, Dict, Optional, Tuple +from datetime import datetime +from typing import Dict, Optional, Tuple, Union +from 
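get_active_user_params widens the scanned date range: each weekly/monthly-active point counts distinct users over the trailing 7 or 30 days, so the event scan has to start one window before the graph's own `date_from`. A sketch of that lower bound, where the constant string values and the `active_user_lower_bound` helper name are placeholders (the real constants come from posthog.constants):

```python
from datetime import datetime, timedelta

# Placeholder values; the real constants are imported from posthog.constants.
WEEKLY_ACTIVE = "weekly_active"
MONTHLY_ACTIVE = "monthly_active"


def active_user_lower_bound(math: str, date_from: datetime) -> datetime:
    """Each weekly/monthly-active point counts distinct users over a trailing window,
    so the event scan must start one window before the graph's own date_from."""
    window = timedelta(days=7) if math == WEEKLY_ACTIVE else timedelta(days=30)
    return date_from - window


print(active_user_lower_bound(WEEKLY_ACTIVE, datetime(2020, 1, 4)))   # 2019-12-28 00:00:00
print(active_user_lower_bound(MONTHLY_ACTIVE, datetime(2020, 1, 4)))  # 2019-12-05 00:00:00
```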
dateutil.relativedelta import relativedelta from django.utils import timezone +from rest_framework.exceptions import ValidationError from ee.clickhouse.client import sync_execute from ee.clickhouse.sql.events import GET_EARLIEST_TIMESTAMP_SQL -from posthog.models.filter import Filter +from posthog.models.event import DEFAULT_EARLIEST_TIME_DELTA +from posthog.models.filters.sessions_filter import SessionEventsFilter +from posthog.queries.base import TIME_IN_SECONDS +from posthog.types import FilterType -def parse_timestamps(filter: Filter, table: str = "") -> Tuple[str, str]: +def parse_timestamps( + filter: Union[FilterType, SessionEventsFilter], team_id: int, table: str = "" +) -> Tuple[str, str, dict]: date_from = None date_to = None - + params = {} if filter.date_from: - date_from = "and {table}timestamp >= '{}'".format( - filter.date_from.strftime( - "%Y-%m-%d{}".format( - " %H:%M:%S" if filter.interval == "hour" or filter.interval == "minute" else " 00:00:00" - ) - ), - table=table, - ) + + date_from = f"AND {table}timestamp >= %(date_from)s" + params.update({"date_from": format_ch_timestamp(filter.date_from, filter)}) else: try: - earliest_date = sync_execute(GET_EARLIEST_TIMESTAMP_SQL)[0][0] + earliest_date = get_earliest_timestamp(team_id) except IndexError: date_from = "" else: - date_from = "and {table}timestamp >= '{}'".format( - earliest_date.strftime( - "%Y-%m-%d{}".format( - " %H:%M:%S" if filter.interval == "hour" or filter.interval == "minute" else " 00:00:00" - ) - ), - table=table, - ) + date_from = f"AND {table}timestamp >= %(date_from)s" + params.update({"date_from": format_ch_timestamp(earliest_date, filter)}) _date_to = filter.date_to - date_to = "and {table}timestamp <= '{}'".format( - _date_to.strftime( - "%Y-%m-%d{}".format( - " %H:%M:%S" if filter.interval == "hour" or filter.interval == "minute" else " 23:59:59" - ), - ), - table=table, + date_to = f"AND {table}timestamp <= %(date_to)s" + params.update({"date_to": format_ch_timestamp(_date_to, filter, " 23:59:59")}) + + return date_from or "", date_to or "", params + + +def format_ch_timestamp(timestamp: datetime, filter, default_hour_min: str = " 00:00:00"): + is_hour_or_min = ( + (filter.interval and filter.interval.lower() == "hour") + or (filter.interval and filter.interval.lower() == "minute") + or (filter._date_from == "-24h") + or (filter._date_from == "-48h") ) - return date_from or "", date_to or "" + return timestamp.strftime("%Y-%m-%d{}".format(" %H:%M:%S" if is_hour_or_min else default_hour_min)) -def get_time_diff(interval: str, start_time: Optional[datetime], end_time: Optional[datetime]) -> Tuple[int, int]: +def get_earliest_timestamp(team_id: int) -> datetime: + results = sync_execute(GET_EARLIEST_TIMESTAMP_SQL, {"team_id": team_id}) + if len(results) > 0: + return results[0][0] + else: + return timezone.now() - DEFAULT_EARLIEST_TIME_DELTA + + +def get_time_diff( + interval: str, start_time: Optional[datetime], end_time: Optional[datetime], team_id: int +) -> Tuple[int, int, bool]: - _start_time = start_time or sync_execute(GET_EARLIEST_TIMESTAMP_SQL)[0][0] + _start_time = start_time or get_earliest_timestamp(team_id) _end_time = end_time or timezone.now() - time_diffs: Dict[str, Any] = { - "minute": 60, - "hour": 3600, - "day": 3600 * 24, - "week": 3600 * 24 * 7, - "month": 3600 * 24 * 30, - } + if interval == "month": + rel_delta = relativedelta(_end_time.replace(day=1), _start_time.replace(day=1)) + return (rel_delta.years * 12) + rel_delta.months + 1, TIME_IN_SECONDS["month"], True diff = 
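For month intervals, get_time_diff counts calendar months between the two dates (both truncated to the first of the month) rather than dividing elapsed seconds, so partial months at either end still get a bucket. A standalone restatement of that branch using dateutil:

```python
from datetime import datetime

from dateutil.relativedelta import relativedelta


def months_between(start: datetime, end: datetime) -> int:
    """Count calendar-month buckets between two dates, inclusive of both ends,
    mirroring the month branch of get_time_diff."""
    rel = relativedelta(end.replace(day=1), start.replace(day=1))
    return rel.years * 12 + rel.months + 1


print(months_between(datetime(2019, 11, 20), datetime(2020, 1, 4)))  # 3 (Nov, Dec, Jan)
```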
_end_time - _start_time - return int(diff.total_seconds() / time_diffs[interval]) + 1, time_diffs[interval] + if interval == "week": + round_interval = True + else: + round_interval = diff.total_seconds() >= TIME_IN_SECONDS[interval] * 2 + return ( + # NOTE: `int` will simply strip the decimal part. Checking the + # extremities, if start_time, end_time are less than an interval apart, + # we'll get 0, then add 1, so we'll always get at least one interval + int(diff.total_seconds() / TIME_IN_SECONDS[interval]) + 1, + TIME_IN_SECONDS[interval], + round_interval, + ) -def get_interval_annotation_ch(interval: Optional[str]) -> str: - if interval is None: - return "toStartOfDay" - map: Dict[str, str] = { - "minute": "toStartOfMinute", - "hour": "toStartOfHour", - "day": "toStartOfDay", - "week": "toStartOfWeek", - "month": "toStartOfMonth", - } - return map[interval] +PERIOD_TO_TRUNC_FUNC: Dict[str, str] = { + "minute": "toStartOfMinute", + "hour": "toStartOfHour", + "week": "toStartOfWeek", + "day": "toStartOfDay", + "month": "toStartOfMonth", +} + + +def get_trunc_func_ch(period: Optional[str]) -> str: + if period is None: + period = "day" + ch_function = PERIOD_TO_TRUNC_FUNC.get(period.lower()) + if ch_function is None: + raise ValidationError(f"Period {period} is unsupported.") + return ch_function + + +PERIOD_TO_INTERVAL_FUNC: Dict[str, str] = { + "minute": "toIntervalMinute", + "hour": "toIntervalHour", + "week": "toIntervalWeek", + "day": "toIntervalDay", + "month": "toIntervalMonth", +} + + +def get_interval_func_ch(period: Optional[str]) -> str: + if period is None: + period = "day" + ch_function = PERIOD_TO_INTERVAL_FUNC.get(period.lower()) + if ch_function is None: + raise ValidationError(f"Interval {period} is unsupported.") + return ch_function + + +def date_from_clause(interval_annotation: str, round_interval: bool) -> str: + if round_interval: + return "AND {interval}(timestamp) >= {interval}(toDateTime(%(date_from)s))".format(interval=interval_annotation) + else: + return "AND timestamp >= %(date_from)s" diff --git a/ee/clickhouse/sql/clickhouse.py b/ee/clickhouse/sql/clickhouse.py index 25f010a100ae6..318a2e8be4d44 100644 --- a/ee/clickhouse/sql/clickhouse.py +++ b/ee/clickhouse/sql/clickhouse.py @@ -3,18 +3,24 @@ from posthog.settings import CLICKHOUSE_ENABLE_STORAGE_POLICY, CLICKHOUSE_REPLICATION, KAFKA_HOSTS, TEST STORAGE_POLICY = "SETTINGS storage_policy = 'hot_to_cold'" if CLICKHOUSE_ENABLE_STORAGE_POLICY else "" -TABLE_ENGINE = ( +REPLACING_TABLE_ENGINE = ( "ReplicatedReplacingMergeTree('/clickhouse/tables/{{shard}}/posthog.{table}', '{{replica}}', {ver})" if CLICKHOUSE_REPLICATION else "ReplacingMergeTree({ver})" ) -TABLE_MERGE_ENGINE = ( +MERGE_TABLE_ENGINE = ( "ReplicatedReplacingMergeTree('/clickhouse/tables/{{shard}}/posthog.{table}', '{{replica}}')" if CLICKHOUSE_REPLICATION else "MergeTree()" ) +COLLAPSING_TABLE_ENGINE = ( + "ReplicatedCollapsingMergeTree('/clickhouse/tables/noshard/posthog.{table}', '{{replica}}-{{shard}}', {ver})" + if CLICKHOUSE_REPLICATION + else "CollapsingMergeTree({ver})" +) + KAFKA_ENGINE = "Kafka('{kafka_host}', '{topic}', '{group}', '{serialization}')" KAFKA_PROTO_ENGINE = """ @@ -36,12 +42,17 @@ , _offset UInt64 """ +COLLAPSING_MERGE_TREE = "collapsing_merge_tree" +REPLACING_MERGE_TREE = "replacing_merge_tree" + -def table_engine(table: str, ver: Optional[str] = None) -> str: - if ver: - return TABLE_ENGINE.format(table=table, ver=ver) +def table_engine(table: str, ver: Optional[str] = None, engine_type: Optional[str] = None) -> str: + 
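get_trunc_func_ch and date_from_clause together decide how the lower date bound is applied: when the interval is rounded (always for weeks, or when the range spans at least two intervals), both sides are truncated so the bucket containing `date_from` is kept whole; otherwise the raw timestamp is compared directly. A condensed sketch combining the two, with the mapping copied from the code above and a `lower_bound_clause` name chosen for illustration:

```python
PERIOD_TO_TRUNC_FUNC = {
    "minute": "toStartOfMinute",
    "hour": "toStartOfHour",
    "day": "toStartOfDay",
    "week": "toStartOfWeek",
    "month": "toStartOfMonth",
}


def lower_bound_clause(period: str, round_interval: bool) -> str:
    """When the interval is rounded, truncate both sides so the bucket containing
    date_from is included whole; otherwise compare the raw timestamp."""
    trunc = PERIOD_TO_TRUNC_FUNC[period.lower()]
    if round_interval:
        return f"AND {trunc}(timestamp) >= {trunc}(toDateTime(%(date_from)s))"
    return "AND timestamp >= %(date_from)s"


print(lower_bound_clause("week", True))
# AND toStartOfWeek(timestamp) >= toStartOfWeek(toDateTime(%(date_from)s))
print(lower_bound_clause("hour", False))
# AND timestamp >= %(date_from)s
```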
if engine_type == COLLAPSING_MERGE_TREE and ver: + return COLLAPSING_TABLE_ENGINE.format(table=table, ver=ver) + elif engine_type == REPLACING_MERGE_TREE and ver: + return REPLACING_TABLE_ENGINE.format(table=table, ver=ver) else: - return TABLE_MERGE_ENGINE.format(table=table) + return MERGE_TABLE_ENGINE.format(table=table) def kafka_engine( @@ -64,5 +75,5 @@ def kafka_engine( ) -def ttl_period(): - return "" if TEST else "TTL toDate(created_at) + INTERVAL 3 WEEK" +def ttl_period(field: str = "created_at", weeks: int = 3): + return "" if TEST else f"TTL toDate({field}) + INTERVAL {weeks} WEEK" diff --git a/ee/clickhouse/sql/cohort.py b/ee/clickhouse/sql/cohort.py index 6fb1625fa6741..c5eac3b465e2f 100644 --- a/ee/clickhouse/sql/cohort.py +++ b/ee/clickhouse/sql/cohort.py @@ -1,3 +1,79 @@ +from posthog.settings import CLICKHOUSE_CLUSTER + +from .clickhouse import COLLAPSING_MERGE_TREE, table_engine + CALCULATE_COHORT_PEOPLE_SQL = """ -SELECT distinct_id FROM person_distinct_id where {query} AND team_id = %(team_id)s +SELECT {id_column} FROM ({GET_TEAM_PERSON_DISTINCT_IDS}) WHERE {query} +""" + +CREATE_COHORTPEOPLE_TABLE_SQL = """ +CREATE TABLE IF NOT EXISTS cohortpeople ON CLUSTER {cluster} +( + person_id UUID, + cohort_id Int64, + team_id Int64, + sign Int8 +) ENGINE = {engine} +Order By (team_id, cohort_id, person_id) +{storage_policy} +""".format( + cluster=CLICKHOUSE_CLUSTER, engine=table_engine("cohortpeople", "sign", COLLAPSING_MERGE_TREE), storage_policy="" +) + +TRUNCATE_COHORTPEOPLE_TABLE_SQL = f"TRUNCATE TABLE IF EXISTS cohortpeople ON CLUSTER {CLICKHOUSE_CLUSTER}" +DROP_COHORTPEOPLE_TABLE_SQL = f"DROP TABLE IF EXISTS cohortpeople ON CLUSTER {CLICKHOUSE_CLUSTER}" + +REMOVE_PEOPLE_NOT_MATCHING_COHORT_ID_SQL = """ +INSERT INTO cohortpeople +SELECT person_id, cohort_id, %(team_id)s as team_id, -1 as _sign +FROM cohortpeople +JOIN ( + SELECT id, argMax(properties, person._timestamp) as properties, sum(is_deleted) as is_deleted FROM person WHERE team_id = %(team_id)s GROUP BY id +) as person ON (person.id = cohortpeople.person_id) +WHERE cohort_id = %(cohort_id)s +AND + ( + person.is_deleted = 1 OR NOT person_id IN ({cohort_filter}) + ) +""" + +GET_COHORT_SIZE_SQL = """ +SELECT count(*) +FROM ( + SELECT 1 + FROM cohortpeople + WHERE team_id = %(team_id)s AND cohort_id = %(cohort_id)s + GROUP BY person_id, cohort_id, team_id + HAVING sum(sign) > 0 +) +""" + +INSERT_PEOPLE_MATCHING_COHORT_ID_SQL = """ +INSERT INTO cohortpeople + SELECT id, %(cohort_id)s as cohort_id, %(team_id)s as team_id, 1 as _sign + FROM ( + SELECT id, argMax(properties, person._timestamp) as properties, sum(is_deleted) as is_deleted FROM person WHERE team_id = %(team_id)s GROUP BY id + ) as person + LEFT JOIN ( + SELECT person_id, sum(sign) AS sign FROM cohortpeople WHERE cohort_id = %(cohort_id)s AND team_id = %(team_id)s GROUP BY person_id + ) as cohortpeople ON (person.id = cohortpeople.person_id) + WHERE (cohortpeople.person_id = '00000000-0000-0000-0000-000000000000' OR sign = 0) + AND person.is_deleted = 0 + AND id IN ({cohort_filter}) +""" + +GET_DISTINCT_ID_BY_ENTITY_SQL = """ +SELECT distinct_id FROM events WHERE team_id = %(team_id)s {date_query} AND {entity_query} +""" + +GET_PERSON_ID_BY_ENTITY_COUNT_SQL = """ +SELECT person_id FROM events +INNER JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) as pdi +ON events.distinct_id = pdi.distinct_id +WHERE team_id = %(team_id)s {date_query} AND {entity_query} +GROUP BY person_id HAVING count(*) {count_operator} %(count)s +""" + +GET_PERSON_ID_BY_PRECALCULATED_COHORT_ID = 
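The cohortpeople table is a CollapsingMergeTree keyed on a `sign` column: adding a person to a cohort inserts +1, removing them inserts -1, and GET_COHORT_SIZE_SQL only counts people whose signs sum to a positive value. A small in-memory simulation of that membership rule:

```python
from collections import defaultdict
from typing import Dict, Iterable, Tuple


def cohort_size(rows: Iterable[Tuple[str, int]]) -> int:
    """Membership in a CollapsingMergeTree-backed cohort: a person counts only while
    the sum of their signs is positive, as in GET_COHORT_SIZE_SQL."""
    signs: Dict[str, int] = defaultdict(int)
    for person_id, sign in rows:
        signs[person_id] += sign
    return sum(1 for total in signs.values() if total > 0)


rows = [
    ("person-a", 1),   # added to the cohort
    ("person-b", 1),   # added ...
    ("person-b", -1),  # ... then removed on recalculation
]
print(cohort_size(rows))  # 1
```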
""" +SELECT person_id FROM cohortpeople WHERE team_id = %(team_id)s AND cohort_id = %({prepend}_cohort_id_{index})s GROUP BY person_id, cohort_id, team_id HAVING sum(sign) > 0 """ diff --git a/ee/clickhouse/sql/dead_letter_queue.py b/ee/clickhouse/sql/dead_letter_queue.py new file mode 100644 index 0000000000000..76088a54fb11b --- /dev/null +++ b/ee/clickhouse/sql/dead_letter_queue.py @@ -0,0 +1,110 @@ +from ee.clickhouse.sql.clickhouse import KAFKA_COLUMNS, REPLACING_MERGE_TREE, kafka_engine, table_engine, ttl_period +from ee.kafka_client.topics import KAFKA_DEAD_LETTER_QUEUE +from posthog.settings import CLICKHOUSE_CLUSTER, CLICKHOUSE_DATABASE + +# We pipe our Kafka dead letter queue into CH for easier analysis and longer retention +# This allows us to explore errors and replay events with ease + +DEAD_LETTER_QUEUE_TABLE = "events_dead_letter_queue" + +DEAD_LETTER_QUEUE_TABLE_BASE_SQL = """ +CREATE TABLE IF NOT EXISTS {table_name} ON CLUSTER {cluster} +( + id UUID, + event_uuid UUID, + event VARCHAR, + properties VARCHAR, + distinct_id VARCHAR, + team_id Int64, + elements_chain VARCHAR, + created_at DateTime64(6, 'UTC'), + ip VARCHAR, + site_url VARCHAR, + now DateTime64(6, 'UTC'), + raw_payload VARCHAR, + error_timestamp DateTime64(6, 'UTC'), + error_location VARCHAR, + error VARCHAR + {extra_fields} +) ENGINE = {engine} +""" + +DEAD_LETTER_QUEUE_TABLE_SQL = ( + DEAD_LETTER_QUEUE_TABLE_BASE_SQL + + """ORDER BY (id, event_uuid, distinct_id, team_id) +{ttl_period} +SETTINGS index_granularity=512 +""" +).format( + table_name=DEAD_LETTER_QUEUE_TABLE, + cluster=CLICKHOUSE_CLUSTER, + extra_fields=KAFKA_COLUMNS, + engine=table_engine(DEAD_LETTER_QUEUE_TABLE, "_timestamp", REPLACING_MERGE_TREE), + ttl_period=ttl_period("_timestamp", 4), # 4 weeks +) + +KAFKA_DEAD_LETTER_QUEUE_TABLE_SQL = DEAD_LETTER_QUEUE_TABLE_BASE_SQL.format( + table_name="kafka_" + DEAD_LETTER_QUEUE_TABLE, + cluster=CLICKHOUSE_CLUSTER, + engine=kafka_engine(topic=KAFKA_DEAD_LETTER_QUEUE), + extra_fields="", +) + +DEAD_LETTER_QUEUE_TABLE_MV_SQL = """ +CREATE MATERIALIZED VIEW IF NOT EXISTS {table_name}_mv ON CLUSTER {cluster} +TO {database}.{table_name} +AS SELECT +id, +event_uuid, +event, +properties, +distinct_id, +team_id, +elements_chain, +created_at, +ip, +site_url, +now, +raw_payload, +error_timestamp, +error_location, +error, +_timestamp, +_offset +FROM {database}.kafka_{table_name} +""".format( + table_name=DEAD_LETTER_QUEUE_TABLE, cluster=CLICKHOUSE_CLUSTER, database=CLICKHOUSE_DATABASE, +) + + +INSERT_DEAD_LETTER_QUEUE_EVENT_SQL = """ +INSERT INTO events_dead_letter_queue +SELECT +%(id)s, +%(event_uuid)s, +%(event)s, +%(properties)s, +%(distinct_id)s, +%(team_id)s, +%(elements_chain)s, +%(created_at)s, +%(ip)s, +%(site_url)s, +%(now)s, +%(raw_payload)s, +%(error_timestamp)s, +%(error_location)s, +%(error)s, +0, +now() +""" + +TRUNCATE_DEAD_LETTER_QUEUE_TABLE_SQL = ( + f"TRUNCATE TABLE IF EXISTS {DEAD_LETTER_QUEUE_TABLE} ON CLUSTER {CLICKHOUSE_CLUSTER}" +) +DROP_KAFKA_DEAD_LETTER_QUEUE_TABLE_SQL = ( + f"DROP TABLE IF EXISTS kafka_{DEAD_LETTER_QUEUE_TABLE} ON CLUSTER {CLICKHOUSE_CLUSTER}" +) +TRUNCATE_DEAD_LETTER_QUEUE_TABLE_MV_SQL = ( + f"TRUNCATE TABLE IF EXISTS {DEAD_LETTER_QUEUE_TABLE}_mv ON CLUSTER {CLICKHOUSE_CLUSTER}" +) diff --git a/ee/clickhouse/sql/events.py b/ee/clickhouse/sql/events.py index 125e418e81a11..06a5b37650911 100644 --- a/ee/clickhouse/sql/events.py +++ b/ee/clickhouse/sql/events.py @@ -1,27 +1,16 @@ from ee.kafka_client.topics import KAFKA_EVENTS +from posthog.settings import 
CLICKHOUSE_CLUSTER, CLICKHOUSE_DATABASE, DEBUG -from .clickhouse import KAFKA_COLUMNS, STORAGE_POLICY, kafka_engine, table_engine - -DROP_EVENTS_TABLE_SQL = """ -DROP TABLE events -""" - -DROP_EVENTS_WITH_ARRAY_PROPS_TABLE_SQL = """ -DROP TABLE events_with_array_props_view -""" - -DROP_MAT_EVENTS_WITH_ARRAY_PROPS_TABLE_SQL = """ -DROP TABLE events_with_array_props_mv -""" - -DROP_MAT_EVENTS_PROP_TABLE_SQL = """ -DROP TABLE events_properties_view -""" +from .clickhouse import KAFKA_COLUMNS, REPLACING_MERGE_TREE, STORAGE_POLICY, kafka_engine, table_engine +from .person import GET_TEAM_PERSON_DISTINCT_IDS EVENTS_TABLE = "events" +TRUNCATE_EVENTS_TABLE_SQL = f"TRUNCATE TABLE IF EXISTS {EVENTS_TABLE} ON CLUSTER {CLICKHOUSE_CLUSTER}" +DROP_EVENTS_TABLE_SQL = f"DROP TABLE IF EXISTS {EVENTS_TABLE} ON CLUSTER {CLICKHOUSE_CLUSTER}" + EVENTS_TABLE_BASE_SQL = """ -CREATE TABLE {table_name} +CREATE TABLE IF NOT EXISTS {table_name} ON CLUSTER {cluster} ( uuid UUID, event VARCHAR, @@ -31,33 +20,49 @@ distinct_id VARCHAR, elements_chain VARCHAR, created_at DateTime64(6, 'UTC') + {materialized_columns} {extra_fields} -) ENGINE = {engine} +) ENGINE = {engine} +""" + +EVENTS_TABLE_MATERIALIZED_COLUMNS = """ + , $group_0 VARCHAR materialized trim(BOTH '\"' FROM JSONExtractRaw(properties, '$group_0')) COMMENT 'column_materializer::$group_0' + , $group_1 VARCHAR materialized trim(BOTH '\"' FROM JSONExtractRaw(properties, '$group_1')) COMMENT 'column_materializer::$group_1' + , $group_2 VARCHAR materialized trim(BOTH '\"' FROM JSONExtractRaw(properties, '$group_2')) COMMENT 'column_materializer::$group_2' + , $group_3 VARCHAR materialized trim(BOTH '\"' FROM JSONExtractRaw(properties, '$group_3')) COMMENT 'column_materializer::$group_3' + , $group_4 VARCHAR materialized trim(BOTH '\"' FROM JSONExtractRaw(properties, '$group_4')) COMMENT 'column_materializer::$group_4' """ EVENTS_TABLE_SQL = ( EVENTS_TABLE_BASE_SQL + """PARTITION BY toYYYYMM(timestamp) ORDER BY (team_id, toDate(timestamp), distinct_id, uuid) -SAMPLE BY uuid +{sample_by_uuid} {storage_policy} """ ).format( table_name=EVENTS_TABLE, - engine=table_engine(EVENTS_TABLE, "_timestamp"), + cluster=CLICKHOUSE_CLUSTER, + engine=table_engine(EVENTS_TABLE, "_timestamp", REPLACING_MERGE_TREE), extra_fields=KAFKA_COLUMNS, + materialized_columns=EVENTS_TABLE_MATERIALIZED_COLUMNS, + sample_by_uuid="SAMPLE BY uuid" if not DEBUG else "", # https://github.com/PostHog/posthog/issues/5684 storage_policy=STORAGE_POLICY, ) KAFKA_EVENTS_TABLE_SQL = EVENTS_TABLE_BASE_SQL.format( table_name="kafka_" + EVENTS_TABLE, + cluster=CLICKHOUSE_CLUSTER, engine=kafka_engine(topic=KAFKA_EVENTS, serialization="Protobuf", proto_schema="events:Event"), extra_fields="", + materialized_columns="", ) +# You must include the database here because of a bug in clickhouse +# related to https://github.com/ClickHouse/ClickHouse/issues/10471 EVENTS_TABLE_MV_SQL = """ -CREATE MATERIALIZED VIEW {table_name}_mv -TO {table_name} +CREATE MATERIALIZED VIEW {table_name}_mv ON CLUSTER {cluster} +TO {database}.{table_name} AS SELECT uuid, event, @@ -69,170 +74,138 @@ created_at, _timestamp, _offset -FROM kafka_{table_name} +FROM {database}.kafka_{table_name} """.format( - table_name=EVENTS_TABLE + table_name=EVENTS_TABLE, cluster=CLICKHOUSE_CLUSTER, database=CLICKHOUSE_DATABASE, ) INSERT_EVENT_SQL = """ -INSERT INTO events SELECT %(uuid)s, %(event)s, %(properties)s, %(timestamp)s, %(team_id)s, %(distinct_id)s, %(elements_chain)s, %(created_at)s, now(), 0 +INSERT INTO events (uuid, event, properties, 
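The materialized `$group_0` .. `$group_4` columns extract each group key at insert time with `trim(BOTH '"' FROM JSONExtractRaw(properties, '$group_N'))`, i.e. take the raw JSON value and strip its surrounding quotes so the column holds the bare key. A Python rendering of that expression, for reference only:

```python
import json


def extract_group_key(properties_json: str, index: int) -> str:
    """Equivalent of trim(BOTH '"' FROM JSONExtractRaw(properties, '$group_N')):
    take the raw JSON value of the group property and strip the surrounding quotes."""
    raw = json.dumps(json.loads(properties_json).get(f"$group_{index}", ""))
    return raw.strip('"')


print(extract_group_key('{"$group_0": "org:5", "plan": "free"}', 0))  # org:5
print(extract_group_key('{"plan": "free"}', 0))                       # empty string, like a missing property
```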
timestamp, team_id, distinct_id, elements_chain, created_at, _timestamp, _offset) +SELECT %(uuid)s, %(event)s, %(properties)s, %(timestamp)s, %(team_id)s, %(distinct_id)s, %(elements_chain)s, %(created_at)s, now(), 0 """ GET_EVENTS_SQL = """ SELECT - ewap.uuid, - ewap.event, - ewap.properties, - ewap.timestamp, - ewap.team_id, - ewap.distinct_id, - ewap.elements_chain, - ewap.created_at -FROM events_with_array_props_view as ewap + uuid, + event, + properties, + timestamp, + team_id, + distinct_id, + elements_chain, + created_at +FROM events """ GET_EVENTS_BY_TEAM_SQL = """ SELECT - ewap.uuid, - ewap.event, - ewap.properties, - ewap.timestamp, - ewap.team_id, - ewap.distinct_id, - ewap.elements_chain, - ewap.created_at -FROM events_with_array_props_view as ewap WHERE team_id = %(team_id)s -""" - -EVENTS_WITH_PROPS_TABLE_SQL = """ -CREATE TABLE events_with_array_props_view -( - uuid UUID, - event VARCHAR, - properties VARCHAR, - timestamp DateTime64(6, 'UTC'), - team_id Int64, - distinct_id VARCHAR, - elements_chain VARCHAR, - created_at DateTime64, - array_property_keys Array(VARCHAR), - array_property_values Array(VARCHAR), - _timestamp UInt64, - _offset UInt64 -) ENGINE = {engine} -PARTITION BY toYYYYMM(timestamp) -ORDER BY (team_id, toDate(timestamp), distinct_id, uuid) -SAMPLE BY uuid -{storage_policy} -""".format( - engine=table_engine("events_with_array_props_view", "_timestamp"), storage_policy=STORAGE_POLICY -) - -MAT_EVENTS_WITH_PROPS_TABLE_SQL = """ -CREATE MATERIALIZED VIEW events_with_array_props_mv -TO events_with_array_props_view -AS SELECT -uuid, -event, -properties, -timestamp, -team_id, -distinct_id, -elements_chain, -created_at, -arrayMap(k -> toString(k.1), JSONExtractKeysAndValuesRaw(properties)) array_property_keys, -arrayMap(k -> toString(k.2), JSONExtractKeysAndValuesRaw(properties)) array_property_values, -_timestamp, -_offset -FROM events -""" - -MAT_EVENT_PROP_TABLE_SQL = """ -CREATE MATERIALIZED VIEW events_properties_view -ENGINE = MergeTree() -ORDER BY (team_id, key, value, event_id) -AS SELECT uuid as event_id, -team_id, -array_property_keys as key, -array_property_values as value -from events_with_array_props_view -ARRAY JOIN array_property_keys, array_property_values + uuid, + event, + properties, + timestamp, + team_id, + distinct_id, + elements_chain, + created_at +FROM events WHERE team_id = %(team_id)s """ SELECT_PROP_VALUES_SQL = """ -SELECT DISTINCT trim(BOTH '\"' FROM value) FROM events_properties_view where key = %(key)s AND team_id = %(team_id)s LIMIT 50 +SELECT DISTINCT trim(BOTH '\"' FROM JSONExtractRaw(properties, %(key)s)) FROM events where JSONHas(properties, %(key)s) AND team_id = %(team_id)s {parsed_date_from} {parsed_date_to} LIMIT 10 """ SELECT_PROP_VALUES_SQL_WITH_FILTER = """ -SELECT DISTINCT trim(BOTH '\"' FROM value) FROM events_properties_view where key = %(key)s AND team_id = %(team_id)s AND trim(BOTH '\"' FROM value) LIKE %(value)s LIMIT 50 +SELECT DISTINCT trim(BOTH '\"' FROM JSONExtractRaw(properties, %(key)s)) FROM events where team_id = %(team_id)s AND trim(BOTH '\"' FROM JSONExtractRaw(properties, %(key)s)) LIKE %(value)s {parsed_date_from} {parsed_date_to} LIMIT 10 """ -SELECT_EVENT_WITH_ARRAY_PROPS_SQL = """ +SELECT_EVENT_BY_TEAM_AND_CONDITIONS_SQL = """ SELECT - ewap.uuid, - ewap.event, - ewap.properties, - ewap.timestamp, - ewap.team_id, - ewap.distinct_id, - ewap.elements_chain, - ewap.created_at + uuid, + event, + properties, + timestamp, + team_id, + distinct_id, + elements_chain, + created_at FROM - 
events_with_array_props_view ewap -where ewap.team_id = %(team_id)s + events +where team_id = %(team_id)s {conditions} -ORDER BY toDate(ewap.timestamp) DESC, ewap.timestamp DESC {limit} +ORDER BY toDate(timestamp) {order}, timestamp {order} {limit} """ -SELECT_EVENT_WITH_PROP_SQL = """ +SELECT_EVENT_BY_TEAM_AND_CONDITIONS_FILTERS_SQL = """ SELECT - ewap.uuid, - ewap.event, - ewap.properties, - ewap.timestamp, - ewap.team_id, - ewap.distinct_id, - ewap.elements_chain, - ewap.created_at -FROM events_with_array_props_view AS ewap -WHERE + uuid, + event, + properties, + timestamp, + team_id, + distinct_id, + elements_chain, + created_at +FROM events +WHERE team_id = %(team_id)s {conditions} {filters} -ORDER BY toDate(ewap.timestamp) DESC, ewap.timestamp DESC {limit} +ORDER BY toDate(timestamp) {order}, timestamp {order} {limit} """ SELECT_ONE_EVENT_SQL = """ SELECT - ewap.uuid, - ewap.event, - ewap.properties, - ewap.timestamp, - ewap.team_id, - ewap.distinct_id, - ewap.elements_chain, - ewap.created_at -FROM events_with_array_props_view ewap WHERE uuid = %(event_id)s AND team_id = %(team_id)s + uuid, + event, + properties, + timestamp, + team_id, + distinct_id, + elements_chain, + created_at +FROM events WHERE uuid = %(event_id)s AND team_id = %(team_id)s """ GET_EARLIEST_TIMESTAMP_SQL = """ -SELECT timestamp from events order by toDate(timestamp), timestamp limit 1 +SELECT timestamp from events WHERE team_id = %(team_id)s order by toDate(timestamp), timestamp limit 1 """ NULL_SQL = """ -SELECT toUInt16(0) AS total, {interval}(toDateTime('{date_to}') - number * {seconds_in_interval}) as day_start from numbers({num_intervals}) -""" - -NULL_BREAKDOWN_SQL = """ -SELECT toUInt16(0) AS total, {interval}(toDateTime('{date_to}') - number * {seconds_in_interval}) as day_start, breakdown_value from numbers({num_intervals}) -""" - -EVENT_JOIN_PERSON_SQL = """ -INNER JOIN (SELECT person_id, distinct_id FROM person_distinct_id WHERE team_id = %(team_id)s) as pid ON events.distinct_id = pid.distinct_id +-- Creates zero values for all date axis ticks for the given date_from, date_to range +SELECT toUInt16(0) AS total, {trunc_func}(toDateTime(%(date_to)s) - {interval_func}(number)) AS day_start + +-- Get the number of `intervals` between date_from and date_to. +-- +-- NOTE: for week there is some unusual behavior, see: +-- https://github.com/ClickHouse/ClickHouse/issues/7322 +-- +-- This actually aligns with what we want, as they are assuming Sunday week starts, +-- and we'd rather have the relative week num difference. Likewise the same for +-- "month" intervals +-- +-- To ensure we get all relevant intervals, we add in the truncated "date_from" +-- value. +-- +-- This behaviour of dateDiff is different to our handling of "week" and "month" +-- differences we are performing in python, which just considers seconds between +-- date_from and date_to +-- +-- TODO: Ths pattern of generating intervals is repeated in several places. Reuse this +-- `ticks` query elsewhere. +FROM numbers(dateDiff(%(interval)s, toDateTime(%(date_from)s), toDateTime(%(date_to)s))) + +UNION ALL + +-- Make sure we capture the interval date_from falls into. 
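NULL_SQL (continued below) generates a zero row for every tick on the date axis, and the outer aggregation unions those with the real per-bucket counts before summing, so intervals with no events still appear as 0 instead of being missing from the series. A day-granularity sketch of the same idea done in Python, with hypothetical names:

```python
from datetime import date, timedelta
from typing import Dict, List, Tuple


def zero_filled(date_from: date, date_to: date, counts: Dict[date, float]) -> List[Tuple[date, float]]:
    """Emit a value for every tick on the date axis, defaulting to 0 where no events
    were counted, which is what the NULL_SQL union plus the outer aggregation achieve."""
    out: List[Tuple[date, float]] = []
    day = date_from
    while day <= date_to:
        out.append((day, counts.get(day, 0.0)))
        day += timedelta(days=1)
    return out


sparse = {date(2020, 1, 2): 1200.0, date(2020, 1, 3): 1350.0}
for day, total in zero_filled(date(2019, 12, 28), date(2020, 1, 4), sparse):
    print(day, total)
```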
+SELECT toUInt16(0) AS total, {trunc_func}(toDateTime(%(date_from)s)) +""" + +EVENT_JOIN_PERSON_SQL = f""" +INNER JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) as pdi ON events.distinct_id = pdi.distinct_id """ GET_EVENTS_WITH_PROPERTIES = """ -SELECT * FROM events WHERE +SELECT * FROM events WHERE team_id = %(team_id)s {filters} {order_by} @@ -254,8 +227,8 @@ tag_regex=EXTRACT_TAG_REGEX, text_regex=EXTRACT_TEXT_REGEX ) -GET_PROPERTIES_VOLUME = """ - SELECT arrayJoin(array_property_keys) as key, count(1) as count FROM events_with_array_props_view WHERE team_id = %(team_id)s AND timestamp > %(timestamp)s GROUP BY key ORDER BY count DESC +GET_CUSTOM_EVENTS = """ +SELECT DISTINCT event FROM events where team_id = %(team_id)s AND event NOT IN ['$autocapture', '$pageview', '$identify', '$pageleave', '$screen'] """ GET_EVENTS_VOLUME = "SELECT event, count(1) as count FROM events WHERE team_id = %(team_id)s AND timestamp > %(timestamp)s GROUP BY event ORDER BY count DESC" diff --git a/ee/clickhouse/sql/funnels/funnel.py b/ee/clickhouse/sql/funnels/funnel.py index 6b8997848b2ed..4169dc59d949a 100644 --- a/ee/clickhouse/sql/funnels/funnel.py +++ b/ee/clickhouse/sql/funnels/funnel.py @@ -1,23 +1,27 @@ -FUNNEL_SQL = """ -SELECT max_step, count(1), groupArray(100)(id) FROM ( - SELECT - pid.person_id as id, - windowFunnel(6048000000000000)(toUInt64(toUnixTimestamp64Micro(timestamp)), - {steps} - ) as max_step - FROM - events - JOIN ( - SELECT person_id, distinct_id FROM person_distinct_id WHERE team_id = %(team_id)s - ) as pid - ON pid.distinct_id = events.distinct_id - WHERE - team_id = %(team_id)s {filters} {parsed_date_from} {parsed_date_to} - AND event IN %(events)s - GROUP BY pid.person_id +FUNNEL_PERSONS_BY_STEP_SQL = """ +SELECT aggregation_target as person_id {extra_fields} +FROM ( + {steps_per_person_query} ) -WHERE max_step > 0 -GROUP BY max_step -ORDER BY max_step ASC -; +WHERE {persons_steps} +ORDER BY aggregation_target +{limit} +OFFSET {offset} +SETTINGS allow_experimental_window_functions = 1 +""" + +FUNNEL_INNER_EVENT_STEPS_QUERY = """ +SELECT +aggregation_target, +timestamp, +{steps} +{select_prop} +FROM ( + {event_query} +) events +{extra_join} +WHERE ( + {steps_condition} +) +{extra_conditions} """ diff --git a/ee/clickhouse/sql/groups.py b/ee/clickhouse/sql/groups.py new file mode 100644 index 0000000000000..b2b86e59f336f --- /dev/null +++ b/ee/clickhouse/sql/groups.py @@ -0,0 +1,62 @@ +from ee.kafka_client.topics import KAFKA_GROUPS +from posthog.settings import CLICKHOUSE_CLUSTER, CLICKHOUSE_DATABASE + +from .clickhouse import KAFKA_COLUMNS, REPLACING_MERGE_TREE, STORAGE_POLICY, kafka_engine, table_engine + +GROUPS_TABLE = "groups" + +DROP_GROUPS_TABLE_SQL = f"DROP TABLE {GROUPS_TABLE} ON CLUSTER {CLICKHOUSE_CLUSTER}" + +GROUPS_TABLE_BASE_SQL = """ +CREATE TABLE IF NOT EXISTS {table_name} ON CLUSTER {cluster} +( + group_type_index UInt8, + group_key VARCHAR, + created_at DateTime64, + team_id Int64, + group_properties VARCHAR + {extra_fields} +) ENGINE = {engine} +""" + +GROUPS_TABLE_SQL = ( + GROUPS_TABLE_BASE_SQL + + """Order By (team_id, group_type_index, group_key) +{storage_policy} +""" +).format( + table_name=GROUPS_TABLE, + cluster=CLICKHOUSE_CLUSTER, + engine=table_engine(GROUPS_TABLE, "_timestamp", REPLACING_MERGE_TREE), + extra_fields=KAFKA_COLUMNS, + storage_policy=STORAGE_POLICY, +) + +KAFKA_GROUPS_TABLE_SQL = GROUPS_TABLE_BASE_SQL.format( + table_name="kafka_" + GROUPS_TABLE, cluster=CLICKHOUSE_CLUSTER, engine=kafka_engine(KAFKA_GROUPS), extra_fields="", +) + +# You must 
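The rewritten funnel queries below compute, per person, the deepest step reached within the conversion window (the removed query delegated this to ClickHouse's windowFunnel). A rough per-person sketch of that notion, with a made-up 14-day default window and a hypothetical `max_step` helper; the real window, step matching, and ordering rules come from the filter and the templated SQL:

```python
from typing import List, Sequence, Tuple

DAY_SECONDS = 86400


def max_step(person_events: List[Tuple[int, str]], steps: Sequence[str],
             window_seconds: int = 14 * DAY_SECONDS) -> int:
    """Deepest funnel step reached by one person: steps must occur in order,
    all within a conversion window that opens at the first step."""
    ordered = sorted(person_events)
    best = 0
    for i, (start_ts, event) in enumerate(ordered):
        if event != steps[0]:
            continue
        reached, next_step = 1, 1
        for ts, ev in ordered[i + 1:]:
            if ts - start_ts > window_seconds:
                break
            if next_step < len(steps) and ev == steps[next_step]:
                reached += 1
                next_step += 1
        best = max(best, reached)
    return best


events = [(0, "sign up"), (100, "create project"), (200, "invite teammate")]
print(max_step(events, ["sign up", "create project", "invite teammate"]))  # 3
```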
include the database here because of a bug in clickhouse +# related to https://github.com/ClickHouse/ClickHouse/issues/10471 +GROUPS_TABLE_MV_SQL = f""" +CREATE MATERIALIZED VIEW {GROUPS_TABLE}_mv ON CLUSTER {CLICKHOUSE_CLUSTER} +TO {CLICKHOUSE_DATABASE}.{GROUPS_TABLE} +AS SELECT +group_type_index, +group_key, +created_at, +team_id, +group_properties, +_timestamp, +_offset +FROM {CLICKHOUSE_DATABASE}.kafka_{GROUPS_TABLE} +""" + +# { ..., "group_0": 1325 } +# To join with events join using $group_{group_type_index} column + +TRUNCATE_GROUPS_TABLE_SQL = f"TRUNCATE TABLE IF EXISTS {GROUPS_TABLE} ON CLUSTER {CLICKHOUSE_CLUSTER}" + +INSERT_GROUP_SQL = """ +INSERT INTO groups (group_type_index, group_key, team_id, group_properties, created_at, _timestamp, _offset) SELECT %(group_type_index)s, %(group_key)s, %(team_id)s, %(group_properties)s, %(created_at)s, %(_timestamp)s, 0 +""" diff --git a/ee/clickhouse/sql/paths/path.py b/ee/clickhouse/sql/paths/path.py index 67aa0a7c142f9..7ee3a8d8fe8e2 100644 --- a/ee/clickhouse/sql/paths/path.py +++ b/ee/clickhouse/sql/paths/path.py @@ -1,136 +1,45 @@ -# Step 1. Make a table with the following fields from events: -# -# - person_id = dedupe event distinct_ids into person_id -# - timestamp -# - path_type = either name of event or $current_url or ... -# - new_session = this is 1 when the event is from a new session -# or 0 if it's less than 30min after and for the same person_id as the previous event -# - marked_session_start = this is the same as "new_session" if no start point given, otherwise it's 1 if -# the current event is the start point (e.g. path_start=/about) or 0 otherwise -paths_query_step_1 = """ - SELECT - person_id, - timestamp, - event_id, - path_type, - neighbor(person_id, -1) != person_id OR dateDiff('minute', toDateTime(neighbor(timestamp, -1)), toDateTime(timestamp)) > 30 AS new_session, - {marked_session_start} as marked_session_start - FROM ( - SELECT - timestamp, - person_id, - events.uuid AS event_id, - {path_type} AS path_type - {select_elements_chain} - FROM events_with_array_props_view AS events - JOIN (SELECT person_id, distinct_id FROM person_distinct_id WHERE team_id = %(team_id)s) as person_distinct_id ON person_distinct_id.distinct_id = events.distinct_id - WHERE - events.team_id = %(team_id)s - AND {event_query} - {filters} - {parsed_date_from} - {parsed_date_to} - GROUP BY - person_id, - timestamp, - event_id, - path_type - {group_by_elements_chain} - ORDER BY - person_id, - timestamp - ) - WHERE {excess_row_filter} +PATH_ARRAY_QUERY = """ + SELECT person_id, + path, + conversion_time, + event_in_session_index, + concat(toString(event_in_session_index), '_', path) as path_key, + if(event_in_session_index > 1, concat(toString(event_in_session_index-1), '_', prev_path), null) AS last_path_key, + path_dropoff_key + FROM ( + + SELECT person_id + , joined_path_tuple.1 as path + , joined_path_tuple.2 as conversion_time + , joined_path_tuple.3 as prev_path + , event_in_session_index + , session_index + , arrayPopFront(arrayPushBack(path_basic, '')) as path_basic_0 + , arrayMap((x,y) -> if(x=y, 0, 1), path_basic, path_basic_0) as mapping + , arrayFilter((x,y) -> y, time, mapping) as timings + , arrayFilter((x,y)->y, path_basic, mapping) as compact_path + {target_clause} + , arrayDifference(limited_timings) as timings_diff + , arrayZip(limited_path, timings_diff, arrayPopBack(arrayPushFront(limited_path, ''))) as limited_path_timings + , concat(toString(length(limited_path)), '_', limited_path[-1]) as path_dropoff_key /* last path 
item */ + FROM ( + SELECT person_id + , path_time_tuple.1 as path_basic + , path_time_tuple.2 as time + , session_index + , arrayZip(paths, timing, arrayDifference(timing)) as paths_tuple + , {session_threshold_clause} as session_paths + FROM ( + SELECT person_id, + groupArray(toUnixTimestamp64Milli(timestamp)) as timing, + groupArray(path_item) as paths + FROM ({path_event_query}) + GROUP BY person_id + ) + /* this array join splits paths for a single personID per session */ + ARRAY JOIN session_paths AS path_time_tuple, arrayEnumerate(session_paths) AS session_index + ) + ARRAY JOIN limited_path_timings AS joined_path_tuple, arrayEnumerate(limited_path_timings) AS event_in_session_index + {boundary_event_filter} + ) """ - -# Step 2. -# - Convert new_session = {1 or 0} into -# ---> session_id = {1, 2, 3...} -# - Remove all "marked_session_start = 0" rows at the start of a session -paths_query_step_2 = """ - SELECT - person_id, - event_id, - timestamp, - path_type, - runningAccumulate(session_id_sumstate) as session_id - FROM ( - SELECT - *, - sumState(new_session) AS session_id_sumstate - FROM - ({paths_query}) - GROUP BY - person_id, - timestamp, - event_id, - path_type, - new_session, - marked_session_start - ORDER BY - person_id, - timestamp - ) - WHERE - marked_session_start = 1 or - (neighbor(marked_session_start, -1) = 1 and neighbor(session_id, -1) = session_id) or - (neighbor(marked_session_start, -2) = 1 and neighbor(session_id, -2) = session_id) or - (neighbor(marked_session_start, -3) = 1 and neighbor(session_id, -3) = session_id) -""".format( - paths_query=paths_query_step_1 -) - -# Step 3. -# - Add event index per session -# - Use the index and path_type to create a path key (e.g. "1_/pricing", "2_/help") -# - Remove every unused row per session (5th and later rows) -# Those rows will only be there if many filter.start_point rows are in a query. -# For example start_point=/pricing and the user clicked back and forth between pricing and other pages. -paths_query_step_3 = """ - SELECT - person_id, - event_id, - timestamp, - path_type, - session_id, - (neighbor(session_id, -4) = session_id ? 5 : - (neighbor(session_id, -3) = session_id ? 4 : - (neighbor(session_id, -2) = session_id ? 3 : - (neighbor(session_id, -1) = session_id ? 2 : 1)))) as session_index, - concat(toString(session_index), '_', path_type) as path_key, - if(session_index > 1, neighbor(path_key, -1), null) AS last_path_key, - if(session_index > 1, neighbor(event_id, -1), null) AS last_event_id - FROM ({paths_query}) - WHERE - session_index <= 4 -""".format( - paths_query=paths_query_step_2 -) - -# Step 4. 
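PATH_ARRAY_QUERY splits each person's event stream into sessions and indexes the path items within a session, producing keys like `1_/home`, `2_/pricing` plus the previous key for edge building. A simplified Python version of the session split and key construction, assuming a 30-minute gap as the session boundary (the new query injects the threshold via `session_threshold_clause`):

```python
from typing import List, Optional, Tuple

SESSION_GAP_SECONDS = 30 * 60  # assumed boundary; the query injects it via session_threshold_clause


def split_sessions(events: List[Tuple[int, str]]) -> List[List[str]]:
    """Split one person's (timestamp, path_item) stream into sessions whenever the gap
    between consecutive events exceeds the threshold."""
    sessions: List[List[str]] = []
    last_ts: Optional[int] = None
    for ts, path in sorted(events):
        if last_ts is None or ts - last_ts > SESSION_GAP_SECONDS:
            sessions.append([])
        sessions[-1].append(path)
        last_ts = ts
    return sessions


def path_keys(session: List[str]) -> List[Tuple[str, Optional[str]]]:
    """Build (path_key, last_path_key) pairs such as ('2_/pricing', '1_/home'),
    mirroring concat(toString(event_in_session_index), '_', path)."""
    keys = [f"{i}_{p}" for i, p in enumerate(session, start=1)]
    return [(key, keys[i - 1] if i else None) for i, key in enumerate(keys)]


for session in split_sessions([(0, "/home"), (60, "/pricing"), (5000, "/docs")]):
    print(path_keys(session))
# [('1_/home', None), ('2_/pricing', '1_/home')]
# [('1_/docs', None)]
```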
-# - Aggregate and get counts for unique pairs -# - Filter out the entry rows that come from "null" -PATHS_QUERY_FINAL = """ - SELECT - last_path_key as source_event, - any(last_event_id) as source_event_id, - path_key as target_event, - any(event_id) target_event_id, - COUNT(*) AS event_count - FROM ( - {paths_query} - ) - WHERE - source_event IS NOT NULL - AND target_event IS NOT NULL - GROUP BY - source_event, - target_event - ORDER BY - event_count DESC, - source_event, - target_event - LIMIT 20 -""".format( - paths_query=paths_query_step_3 -) diff --git a/ee/clickhouse/sql/person.py b/ee/clickhouse/sql/person.py index 7a6b55378069e..be03d66446560 100644 --- a/ee/clickhouse/sql/person.py +++ b/ee/clickhouse/sql/person.py @@ -1,27 +1,34 @@ from ee.kafka_client.topics import KAFKA_PERSON, KAFKA_PERSON_UNIQUE_ID +from posthog.settings import CLICKHOUSE_CLUSTER, CLICKHOUSE_DATABASE + +from .clickhouse import ( + COLLAPSING_MERGE_TREE, + KAFKA_COLUMNS, + REPLACING_MERGE_TREE, + STORAGE_POLICY, + kafka_engine, + table_engine, +) -from .clickhouse import KAFKA_COLUMNS, STORAGE_POLICY, kafka_engine, table_engine +TRUNCATE_PERSON_TABLE_SQL = f"TRUNCATE TABLE IF EXISTS person ON CLUSTER {CLICKHOUSE_CLUSTER}" -DROP_PERSON_TABLE_SQL = """ -DROP TABLE person -""" +DROP_PERSON_TABLE_SQL = f"DROP TABLE IF EXISTS person ON CLUSTER {CLICKHOUSE_CLUSTER}" -DROP_PERSON_DISTINCT_ID_TABLE_SQL = """ -DROP TABLE person_distinct_id -""" +TRUNCATE_PERSON_DISTINCT_ID_TABLE_SQL = f"TRUNCATE TABLE IF EXISTS person_distinct_id ON CLUSTER {CLICKHOUSE_CLUSTER}" PERSONS_TABLE = "person" PERSONS_TABLE_BASE_SQL = """ -CREATE TABLE {table_name} +CREATE TABLE IF NOT EXISTS {table_name} ON CLUSTER {cluster} ( id UUID, created_at DateTime64, team_id Int64, properties VARCHAR, - is_identified Boolean + is_identified Boolean, + is_deleted Boolean DEFAULT 0 {extra_fields} -) ENGINE = {engine} +) ENGINE = {engine} """ PERSONS_TABLE_SQL = ( @@ -31,37 +38,57 @@ """ ).format( table_name=PERSONS_TABLE, - engine=table_engine(PERSONS_TABLE, "_timestamp"), + cluster=CLICKHOUSE_CLUSTER, + engine=table_engine(PERSONS_TABLE, "_timestamp", REPLACING_MERGE_TREE), extra_fields=KAFKA_COLUMNS, storage_policy=STORAGE_POLICY, ) KAFKA_PERSONS_TABLE_SQL = PERSONS_TABLE_BASE_SQL.format( - table_name="kafka_" + PERSONS_TABLE, engine=kafka_engine(KAFKA_PERSON), extra_fields="", + table_name="kafka_" + PERSONS_TABLE, cluster=CLICKHOUSE_CLUSTER, engine=kafka_engine(KAFKA_PERSON), extra_fields="", ) +# You must include the database here because of a bug in clickhouse +# related to https://github.com/ClickHouse/ClickHouse/issues/10471 PERSONS_TABLE_MV_SQL = """ -CREATE MATERIALIZED VIEW {table_name}_mv -TO {table_name} +CREATE MATERIALIZED VIEW {table_name}_mv ON CLUSTER {cluster} +TO {database}.{table_name} AS SELECT id, created_at, team_id, properties, is_identified, +is_deleted, _timestamp, _offset -FROM kafka_{table_name} +FROM {database}.kafka_{table_name} """.format( - table_name=PERSONS_TABLE + table_name=PERSONS_TABLE, cluster=CLICKHOUSE_CLUSTER, database=CLICKHOUSE_DATABASE, ) GET_LATEST_PERSON_SQL = """ SELECT * FROM person JOIN ( - SELECT id, max(created_at) as created_at FROM person WHERE team_id = %(team_id)s GROUP BY id -) as person_max ON person.id = person_max.id AND person.created_at = person_max.created_at + SELECT id, max(_timestamp) as _timestamp, max(is_deleted) as is_deleted + FROM person + WHERE team_id = %(team_id)s + GROUP BY id +) as person_max ON person.id = person_max.id AND person._timestamp = person_max._timestamp WHERE 
team_id = %(team_id)s -{query} + AND person_max.is_deleted = 0 + {query} +""" + +GET_TEAM_PERSON_DISTINCT_IDS = """ +SELECT distinct_id, argMax(person_id, _timestamp) as person_id +FROM ( + SELECT distinct_id, person_id, max(_timestamp) as _timestamp + FROM person_distinct_id + WHERE team_id = %(team_id)s + GROUP BY person_id, distinct_id, team_id + HAVING max(is_deleted) = 0 +) +GROUP BY distinct_id """ GET_LATEST_PERSON_ID_SQL = """ @@ -72,189 +99,221 @@ latest_person_sql=GET_LATEST_PERSON_SQL ) -GET_PERSON_SQL = """ -SELECT * FROM ({latest_person_sql}) person WHERE team_id = %(team_id)s -""".format( - latest_person_sql=GET_LATEST_PERSON_SQL -) - PERSONS_DISTINCT_ID_TABLE = "person_distinct_id" PERSONS_DISTINCT_ID_TABLE_BASE_SQL = """ -CREATE TABLE {table_name} +CREATE TABLE IF NOT EXISTS {table_name} ON CLUSTER {cluster} ( - id Int64, distinct_id VARCHAR, person_id UUID, - team_id Int64 + team_id Int64, + _sign Int8 DEFAULT 1, + is_deleted Int8 ALIAS if(_sign==-1, 1, 0) {extra_fields} -) ENGINE = {engine} +) ENGINE = {engine} """ PERSONS_DISTINCT_ID_TABLE_SQL = ( PERSONS_DISTINCT_ID_TABLE_BASE_SQL - + """Order By (team_id, distinct_id, person_id, id) + + """Order By (team_id, distinct_id, person_id) {storage_policy} """ ).format( table_name=PERSONS_DISTINCT_ID_TABLE, - engine=table_engine(PERSONS_DISTINCT_ID_TABLE, "_timestamp"), + cluster=CLICKHOUSE_CLUSTER, + engine=table_engine(PERSONS_DISTINCT_ID_TABLE, "_sign", COLLAPSING_MERGE_TREE), extra_fields=KAFKA_COLUMNS, storage_policy=STORAGE_POLICY, ) -KAFKA_PERSONS_DISTINCT_ID_TABLE_SQL = PERSONS_DISTINCT_ID_TABLE_BASE_SQL.format( - table_name="kafka_" + PERSONS_DISTINCT_ID_TABLE, engine=kafka_engine(KAFKA_PERSON_UNIQUE_ID), extra_fields="", +# :KLUDGE: We default is_deleted to 0 for backwards compatibility for when we drop `is_deleted` from message schema. +# Can't make DEFAULT if(_sign==-1, 1, 0) because Cyclic aliases error. 
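GET_TEAM_PERSON_DISTINCT_IDS resolves each distinct_id to a person_id by grouping the collapsing person_distinct_id rows per (distinct_id, person_id) pair, discarding pairs whose mapping was deleted, and then taking the person with the latest `_timestamp` (argMax). A small in-memory simulation of that resolution:

```python
from collections import defaultdict
from typing import Dict, List, Tuple

# Each row mirrors person_distinct_id: (distinct_id, person_id, _timestamp, is_deleted)
Row = Tuple[str, str, int, int]


def resolve_distinct_ids(rows: List[Row]) -> Dict[str, str]:
    """Group by (distinct_id, person_id), drop pairs that were ever marked deleted,
    then per distinct_id keep the person_id with the latest _timestamp (argMax)."""
    latest_ts: Dict[Tuple[str, str], int] = defaultdict(int)
    ever_deleted: Dict[Tuple[str, str], bool] = defaultdict(bool)
    for distinct_id, person_id, ts, is_deleted in rows:
        pair = (distinct_id, person_id)
        latest_ts[pair] = max(latest_ts[pair], ts)
        ever_deleted[pair] = ever_deleted[pair] or bool(is_deleted)

    result: Dict[str, str] = {}
    best_ts: Dict[str, int] = {}
    for (distinct_id, person_id), ts in latest_ts.items():
        if ever_deleted[(distinct_id, person_id)]:
            continue
        if ts >= best_ts.get(distinct_id, -1):
            best_ts[distinct_id] = ts
            result[distinct_id] = person_id
    return result


rows = [
    ("cookie-1", "person-a", 1, 0),
    ("cookie-1", "person-b", 2, 0),  # re-mapped after a merge, newer timestamp wins
    ("cookie-2", "person-c", 1, 0),
    ("cookie-2", "person-c", 2, 1),  # mapping deleted, dropped entirely
]
print(resolve_distinct_ids(rows))  # {'cookie-1': 'person-b'}
```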
+KAFKA_PERSONS_DISTINCT_ID_TABLE_SQL = """ +CREATE TABLE {table_name} ON CLUSTER {cluster} +( + distinct_id VARCHAR, + person_id UUID, + team_id Int64, + _sign Nullable(Int8), + is_deleted Nullable(Int8) +) ENGINE = {engine} +""".format( + table_name="kafka_" + PERSONS_DISTINCT_ID_TABLE, + cluster=CLICKHOUSE_CLUSTER, + engine=kafka_engine(KAFKA_PERSON_UNIQUE_ID), ) +# You must include the database here because of a bug in clickhouse +# related to https://github.com/ClickHouse/ClickHouse/issues/10471 PERSONS_DISTINCT_ID_TABLE_MV_SQL = """ -CREATE MATERIALIZED VIEW {table_name}_mv -TO {table_name} +CREATE MATERIALIZED VIEW {table_name}_mv ON CLUSTER {cluster} +TO {database}.{table_name} AS SELECT -id, distinct_id, person_id, team_id, +coalesce(_sign, if(is_deleted==0, 1, -1)) AS _sign, _timestamp, _offset -FROM kafka_{table_name} +FROM {database}.kafka_{table_name} """.format( - table_name=PERSONS_DISTINCT_ID_TABLE + table_name=PERSONS_DISTINCT_ID_TABLE, cluster=CLICKHOUSE_CLUSTER, database=CLICKHOUSE_DATABASE, ) -GET_DISTINCT_IDS_SQL = """ -SELECT * FROM person_distinct_id WHERE team_id = %(team_id)s +# +# Static Cohort +# + +PERSON_STATIC_COHORT_TABLE = "person_static_cohort" +PERSON_STATIC_COHORT_BASE_SQL = """ +CREATE TABLE IF NOT EXISTS {table_name} ON CLUSTER {cluster} +( + id UUID, + person_id UUID, + cohort_id Int64, + team_id Int64 + {extra_fields} +) ENGINE = {engine} """ -GET_DISTINCT_IDS_SQL_BY_ID = """ -SELECT * FROM person_distinct_id WHERE team_id = %(team_id)s AND person_id = %(person_id)s +PERSON_STATIC_COHORT_TABLE_SQL = ( + PERSON_STATIC_COHORT_BASE_SQL + + """Order By (team_id, cohort_id, person_id, id) +{storage_policy} """ +).format( + table_name=PERSON_STATIC_COHORT_TABLE, + cluster=CLICKHOUSE_CLUSTER, + engine=table_engine(PERSON_STATIC_COHORT_TABLE, "_timestamp", REPLACING_MERGE_TREE), + storage_policy=STORAGE_POLICY, + extra_fields=KAFKA_COLUMNS, +) -GET_PERSON_IDS_BY_FILTER = """ -SELECT DISTINCT p.id -FROM ({latest_person_sql}) AS p -INNER JOIN ( - SELECT person_id, distinct_id - FROM person_distinct_id - WHERE team_id = %(team_id)s -) AS pid ON p.id = pid.person_id -WHERE team_id = %(team_id)s - {distinct_query} -""".format( - latest_person_sql=GET_LATEST_PERSON_SQL, distinct_query="{distinct_query}" +TRUNCATE_PERSON_STATIC_COHORT_TABLE_SQL = ( + f"TRUNCATE TABLE IF EXISTS {PERSON_STATIC_COHORT_TABLE} ON CLUSTER {CLICKHOUSE_CLUSTER}" ) -GET_PERSON_BY_DISTINCT_ID = """ -SELECT p.id +INSERT_PERSON_STATIC_COHORT = ( + f"INSERT INTO {PERSON_STATIC_COHORT_TABLE} (id, person_id, cohort_id, team_id, _timestamp) VALUES" +) + +# +# Other queries +# + +GET_PERSON_IDS_BY_FILTER = """ +SELECT DISTINCT p.id FROM ({latest_person_sql}) AS p -INNER JOIN ( - SELECT person_id, distinct_id - FROM person_distinct_id - WHERE team_id = %(team_id)s -) AS pid ON p.id = pid.person_id +INNER JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) AS pdi ON p.id = pdi.person_id WHERE team_id = %(team_id)s - AND pid.distinct_id = %(distinct_id)s {distinct_query} """.format( - latest_person_sql=GET_LATEST_PERSON_SQL, distinct_query="{distinct_query}" + latest_person_sql=GET_LATEST_PERSON_SQL, + distinct_query="{distinct_query}", + GET_TEAM_PERSON_DISTINCT_IDS=GET_TEAM_PERSON_DISTINCT_IDS, ) -GET_PERSONS_BY_DISTINCT_IDS = """ -SELECT - p.id, - p.created_at, - p.team_id, - p.properties, - p.is_identified, - groupArray(pid.distinct_id) as distinct_ids -FROM - person as p -INNER JOIN - person_distinct_id as pid on p.id = pid.person_id -WHERE - team_id = %(team_id)s - AND distinct_id IN (%(distinct_ids)s) -GROUP 
BY - p.id, - p.created_at, - p.team_id, - p.properties, - p.is_identified -""" - -PERSON_DISTINCT_ID_EXISTS_SQL = """ -SELECT count(*) FROM person_distinct_id -inner join ( - SELECT arrayJoin({}) as distinct_id - ) as id_params ON id_params.distinct_id = person_distinct_id.distinct_id -where person_distinct_id.team_id = %(team_id)s -""" - INSERT_PERSON_SQL = """ -INSERT INTO person SELECT %(id)s, %(created_at)s, %(team_id)s, %(properties)s, %(is_identified)s, now(), 0 +INSERT INTO person (id, created_at, team_id, properties, is_identified, _timestamp, _offset, is_deleted) SELECT %(id)s, %(created_at)s, %(team_id)s, %(properties)s, %(is_identified)s, %(_timestamp)s, 0, 0 """ INSERT_PERSON_DISTINCT_ID = """ -INSERT INTO person_distinct_id SELECT %(id)s, %(distinct_id)s, %(person_id)s, %(team_id)s, now(), 0 VALUES -""" - -UPDATE_PERSON_PROPERTIES = """ -ALTER TABLE person UPDATE properties = %(properties)s where id = %(id)s -""" - -UPDATE_PERSON_ATTACHED_DISTINCT_ID = """ -ALTER TABLE person_distinct_id UPDATE person_id = %(person_id)s where distinct_id = %(distinct_id)s +INSERT INTO person_distinct_id SELECT %(distinct_id)s, %(person_id)s, %(team_id)s, %(_sign)s, now(), 0 VALUES """ DELETE_PERSON_BY_ID = """ -ALTER TABLE person DELETE where id = %(id)s +INSERT INTO person (id, created_at, team_id, properties, is_identified, _timestamp, _offset, is_deleted) SELECT %(id)s, %(created_at)s, %(team_id)s, %(properties)s, %(is_identified)s, %(_timestamp)s, 0, 1 """ DELETE_PERSON_EVENTS_BY_ID = """ ALTER TABLE events DELETE -where distinct_id IN ( +WHERE distinct_id IN ( SELECT distinct_id FROM person_distinct_id WHERE person_id=%(id)s AND team_id = %(team_id)s ) AND team_id = %(team_id)s """ -DELETE_PERSON_DISTINCT_ID_BY_PERSON_ID = """ -ALTER TABLE person_distinct_id DELETE where person_id = %(id)s -""" - -UPDATE_PERSON_IS_IDENTIFIED = """ -ALTER TABLE person UPDATE is_identified = %(is_identified)s where id = %(id)s -""" - -PERSON_TREND_SQL = """ -SELECT DISTINCT distinct_id FROM events WHERE team_id = %(team_id)s {entity_filter} {filters} {parsed_date_from} {parsed_date_to} {person_filter} +INSERT_COHORT_ALL_PEOPLE_THROUGH_PERSON_ID = """ +INSERT INTO {cohort_table} SELECT generateUUIDv4(), id, %(cohort_id)s, %(team_id)s, %(_timestamp)s, 0 FROM ( + SELECT person_id as id FROM ({query}) +) """ -PEOPLE_THROUGH_DISTINCT_SQL = """ +PEOPLE_SQL = """ SELECT id, created_at, team_id, properties, is_identified, groupArray(distinct_id) FROM ( {latest_person_sql} ) as person INNER JOIN ( - SELECT DISTINCT person_id, distinct_id FROM person_distinct_id WHERE distinct_id IN ({content_sql}) AND team_id = %(team_id)s + SELECT person_id, distinct_id FROM ({GET_TEAM_PERSON_DISTINCT_IDS}) WHERE person_id IN ({content_sql}) ) as pdi ON person.id = pdi.person_id -WHERE team_id = %(team_id)s GROUP BY id, created_at, team_id, properties, is_identified -LIMIT 200 OFFSET %(offset)s +LIMIT 100 OFFSET %(offset)s """ -PEOPLE_SQL = """ -SELECT id, created_at, team_id, properties, is_identified, groupArray(distinct_id) FROM ( - {latest_person_sql} -) as person INNER JOIN ( - SELECT DISTINCT person_id, distinct_id FROM person_distinct_id WHERE person_id IN ({content_sql}) AND team_id = %(team_id)s -) as pdi ON person.id = pdi.person_id GROUP BY id, created_at, team_id, properties, is_identified -LIMIT 200 OFFSET %(offset)s +INSERT_COHORT_ALL_PEOPLE_SQL = """ +INSERT INTO {cohort_table} SELECT generateUUIDv4(), id, %(cohort_id)s, %(team_id)s, %(_timestamp)s, 0 FROM ( + SELECT id FROM ( + {latest_person_sql} + ) as person 
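DELETE_PERSON_BY_ID above no longer issues an ALTER TABLE ... DELETE mutation: deleting a person is itself an insert of a newer row with `is_deleted = 1`, which the ReplacingMergeTree plus the read-time `is_deleted = 0` filters then hide. A sketch of such a tombstone write, assuming `sync_execute` accepts a parameter dict; the `soft_delete_person` wrapper is hypothetical and the field values are illustrative:

```python
from datetime import datetime, timezone

# Module paths as used elsewhere in this diff; assumed to be importable here.
from ee.clickhouse.client import sync_execute
from ee.clickhouse.sql.person import DELETE_PERSON_BY_ID


def soft_delete_person(person_id: str, team_id: int, created_at: datetime, properties_json: str) -> None:
    """Hypothetical wrapper: deleting a person writes a newer row with is_deleted = 1,
    which the ReplacingMergeTree and the is_deleted = 0 read filters then hide."""
    sync_execute(
        DELETE_PERSON_BY_ID,
        {
            "id": person_id,
            "created_at": created_at,
            "team_id": team_id,
            "properties": properties_json,
            "is_identified": 0,
            "_timestamp": datetime.now(timezone.utc),
        },
    )
```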
INNER JOIN ( + SELECT person_id, distinct_id FROM ({GET_TEAM_PERSON_DISTINCT_IDS}) WHERE person_id IN ({content_sql}) + ) as pdi ON person.id = pdi.person_id + WHERE team_id = %(team_id)s + GROUP BY id +) """ GET_DISTINCT_IDS_BY_PROPERTY_SQL = """ -SELECT distinct_id FROM person_distinct_id WHERE person_id IN +SELECT distinct_id +FROM ( + {GET_TEAM_PERSON_DISTINCT_IDS} +) +WHERE person_id IN ( SELECT id - FROM person - WHERE team_id = %(team_id)s {filters} -) AND team_id = %(team_id)s + FROM ( + SELECT id, argMax(properties, person._timestamp) as properties, max(is_deleted) as is_deleted + FROM person + WHERE team_id = %(team_id)s + GROUP BY id + HAVING is_deleted = 0 + ) + WHERE 1 = 1 {filters} +) +""".format( + filters="{filters}", GET_TEAM_PERSON_DISTINCT_IDS=GET_TEAM_PERSON_DISTINCT_IDS, +) + +GET_DISTINCT_IDS_BY_PERSON_ID_FILTER = """ +SELECT distinct_id +FROM ({GET_TEAM_PERSON_DISTINCT_IDS}) +WHERE {filters} +""".format( + filters="{filters}", GET_TEAM_PERSON_DISTINCT_IDS=GET_TEAM_PERSON_DISTINCT_IDS, +) + +GET_PERSON_PROPERTIES_COUNT = """ +SELECT tupleElement(keysAndValues, 1) as key, count(*) as count +FROM person +ARRAY JOIN JSONExtractKeysAndValuesRaw(properties) as keysAndValues +WHERE team_id = %(team_id)s +GROUP BY tupleElement(keysAndValues, 1) +ORDER BY count DESC, key ASC +""" + +GET_PERSONS_FROM_EVENT_QUERY = """ +SELECT + person_id, + created_at, + team_id, + person_props, + is_identified, + arrayReduce('groupUniqArray', groupArray(distinct_id)) AS distinct_ids +FROM ({events_query}) +GROUP BY + person_id, + created_at, + team_id, + person_props, + is_identified +LIMIT %(limit)s +OFFSET %(offset)s """ diff --git a/ee/clickhouse/sql/plugin_log_entries.py b/ee/clickhouse/sql/plugin_log_entries.py new file mode 100644 index 0000000000000..8f8c68db63f04 --- /dev/null +++ b/ee/clickhouse/sql/plugin_log_entries.py @@ -0,0 +1,73 @@ +from ee.kafka_client.topics import KAFKA_PLUGIN_LOG_ENTRIES +from posthog.settings import CLICKHOUSE_CLUSTER, CLICKHOUSE_DATABASE +from posthog.tasks.delete_old_plugin_logs import TTL_WEEKS + +from .clickhouse import KAFKA_COLUMNS, REPLACING_MERGE_TREE, kafka_engine, table_engine, ttl_period + +PLUGIN_LOG_ENTRIES_TABLE = "plugin_log_entries" + +PLUGIN_LOG_ENTRIES_TABLE_BASE_SQL = """ +CREATE TABLE IF NOT EXISTS {table_name} ON CLUSTER {cluster} +( + id UUID, + team_id Int64, + plugin_id Int64, + plugin_config_id Int64, + timestamp DateTime64(6, 'UTC'), + source VARCHAR, + type VARCHAR, + message VARCHAR, + instance_id UUID + {extra_fields} +) ENGINE = {engine} +""" + +PLUGIN_LOG_ENTRIES_TABLE_SQL = ( + PLUGIN_LOG_ENTRIES_TABLE_BASE_SQL + + """PARTITION BY plugin_id ORDER BY (team_id, id) +{ttl_period} +SETTINGS index_granularity=512 +""" +).format( + table_name=PLUGIN_LOG_ENTRIES_TABLE, + cluster=CLICKHOUSE_CLUSTER, + extra_fields=KAFKA_COLUMNS, + engine=table_engine(PLUGIN_LOG_ENTRIES_TABLE, "_timestamp", REPLACING_MERGE_TREE), + ttl_period=ttl_period("timestamp", TTL_WEEKS), +) + +KAFKA_PLUGIN_LOG_ENTRIES_TABLE_SQL = PLUGIN_LOG_ENTRIES_TABLE_BASE_SQL.format( + table_name="kafka_" + PLUGIN_LOG_ENTRIES_TABLE, + cluster=CLICKHOUSE_CLUSTER, + engine=kafka_engine(topic=KAFKA_PLUGIN_LOG_ENTRIES), + extra_fields="", +) + +PLUGIN_LOG_ENTRIES_TABLE_MV_SQL = """ +CREATE MATERIALIZED VIEW {table_name}_mv ON CLUSTER {cluster} +TO {database}.{table_name} +AS SELECT +id, +team_id, +plugin_id, +plugin_config_id, +timestamp, +source, +type, +message, +instance_id, +_timestamp, +_offset +FROM {database}.kafka_{table_name} +""".format( + 
table_name=PLUGIN_LOG_ENTRIES_TABLE, cluster=CLICKHOUSE_CLUSTER, database=CLICKHOUSE_DATABASE, +) + + +INSERT_PLUGIN_LOG_ENTRY_SQL = """ +INSERT INTO plugin_log_entries SELECT %(id)s, %(team_id)s, %(plugin_id)s, %(plugin_config_id)s, %(timestamp)s, %(source)s, %(type)s, %(message)s, %(instance_id)s, now(), 0 +""" + +TRUNCATE_PLUGIN_LOG_ENTRIES_TABLE_SQL = ( + f"TRUNCATE TABLE IF EXISTS {PLUGIN_LOG_ENTRIES_TABLE} ON CLUSTER {CLICKHOUSE_CLUSTER}" +) diff --git a/ee/clickhouse/sql/retention/people_in_period.py b/ee/clickhouse/sql/retention/people_in_period.py new file mode 100644 index 0000000000000..d40ad69a94333 --- /dev/null +++ b/ee/clickhouse/sql/retention/people_in_period.py @@ -0,0 +1,74 @@ +RETENTION_PEOPLE_PER_PERIOD_SQL = """ +SELECT toString(person_id), count(person_id) appearance_count, groupArray(intervals_from_base) appearances FROM ( + SELECT DISTINCT + datediff(%(period)s, {trunc_func}(toDateTime(%(start_date)s)), reference_event.event_date) as base_interval, + datediff(%(period)s, reference_event.event_date, {trunc_func}(toDateTime(event_date))) as intervals_from_base, + event.person_id + FROM ( + SELECT + timestamp AS event_date, + pdi.person_id as person_id, + e.uuid as uuid, + e.event as event + FROM events e join ({GET_TEAM_PERSON_DISTINCT_IDS}) pdi on e.distinct_id = pdi.distinct_id + where toDateTime(e.timestamp) >= toDateTime(%(start_date)s) AND toDateTime(e.timestamp) <= toDateTime(%(end_date)s) + AND e.team_id = %(team_id)s {returning_query} {filters} + ) event + JOIN ( + {first_event_sql} + ) reference_event + ON (event.person_id = reference_event.person_id) + WHERE {trunc_func}(event.event_date) > {trunc_func}(reference_event.event_date) + UNION ALL + {first_event_default_sql} +) person_appearances +WHERE base_interval = 0 +GROUP BY person_id +ORDER BY appearance_count DESC +LIMIT %(limit)s OFFSET %(offset)s +""" + +REFERENCE_EVENT_PEOPLE_PER_PERIOD_SQL = """ +SELECT DISTINCT +{trunc_func}(e.timestamp) as event_date, +pdi.person_id as person_id, +e.uuid as uuid, +e.event as event +from events e JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) pdi on e.distinct_id = pdi.distinct_id +where event_date = {trunc_func}(toDateTime(%(start_date)s)) +AND e.team_id = %(team_id)s {target_query} {filters} +""" + + +DEFAULT_REFERENCE_EVENT_PEOPLE_PER_PERIOD_SQL = """ +SELECT DISTINCT +0, +0, +pdi.person_id as person_id +from events e JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) pdi on e.distinct_id = pdi.distinct_id +where {trunc_func}(e.timestamp) = {trunc_func}(toDateTime(%(start_date)s)) +AND e.team_id = %(team_id)s {target_query} {filters} +""" + +REFERENCE_EVENT_UNIQUE_PEOPLE_PER_PERIOD_SQL = """ +SELECT DISTINCT +min({trunc_func}(e.timestamp)) as event_date, +pdi.person_id as person_id, +argMin(e.uuid, {trunc_func}(e.timestamp)) as min_uuid, +argMin(e.event, {trunc_func}(e.timestamp)) as min_event +from events e JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) pdi on e.distinct_id = pdi.distinct_id +WHERE e.team_id = %(team_id)s {target_query} {filters} +GROUP BY person_id HAVING +event_date = {trunc_func}(toDateTime(%(start_date)s)) +""" + +DEFAULT_REFERENCE_EVENT_UNIQUE_PEOPLE_PER_PERIOD_SQL = """ +SELECT DISTINCT +0, +0, +pdi.person_id as person_id +from events e JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) pdi on e.distinct_id = pdi.distinct_id +WHERE e.team_id = %(team_id)s {target_query} {filters} +GROUP BY person_id HAVING +min({trunc_func}(e.timestamp)) = {trunc_func}(toDateTime(%(start_date)s)) +""" diff --git a/ee/clickhouse/sql/retention/retention.py b/ee/clickhouse/sql/retention/retention.py 
index 27535e6442a88..db5e2210f768f 100644 --- a/ee/clickhouse/sql/retention/retention.py +++ b/ee/clickhouse/sql/retention/retention.py @@ -1,43 +1,57 @@ -# Would love a clickhouse CTE right about here - RETENTION_SQL = """ SELECT - datediff(%(period)s, {trunc_func}(toDateTime(%(start_date)s)), reference_event.event_date) as period_to_event_days, - datediff(%(period)s, reference_event.event_date, {trunc_func}(toDateTime(event_date))) as period_between_events_days, - COUNT(DISTINCT event.person_id) count + datediff(%(period)s, {trunc_func}(toDateTime(%(start_date)s)), reference_event.event_date) as base_interval, + datediff(%(period)s, reference_event.event_date, {trunc_func}(toDateTime(event_date))) as intervals_from_base, + COUNT(DISTINCT event.target) count FROM ( - SELECT - timestamp AS event_date, - pdi.person_id as person_id - FROM events e join (SELECT person_id, distinct_id FROM person_distinct_id WHERE team_id = %(team_id)s) pdi on e.distinct_id = pdi.distinct_id - where toDateTime(e.timestamp) >= toDateTime(%(start_date)s) AND toDateTime(e.timestamp) <= toDateTime(%(end_date)s) - AND e.team_id = %(team_id)s {returning_query} {filters} - {extra_union} + {returning_event_query} ) event JOIN ( - {reference_event_sql} + {target_event_query} ) reference_event - ON (event.person_id = reference_event.person_id) -WHERE {trunc_func}(event.event_date) >= {trunc_func}(reference_event.event_date) -GROUP BY period_to_event_days, period_between_events_days -ORDER BY period_to_event_days, period_between_events_days + ON (event.target = reference_event.target) +WHERE {trunc_func}(event.event_date) > {trunc_func}(reference_event.event_date) +GROUP BY base_interval, intervals_from_base +ORDER BY base_interval, intervals_from_base """ REFERENCE_EVENT_SQL = """ -SELECT DISTINCT +SELECT DISTINCT {trunc_func}(e.timestamp) as event_date, -pdi.person_id as person_id -from events e JOIN (SELECT person_id, distinct_id FROM person_distinct_id WHERE team_id = %(team_id)s) pdi on e.distinct_id = pdi.distinct_id -where toDateTime(e.timestamp) >= toDateTime(%(start_date)s) AND toDateTime(e.timestamp) <= toDateTime(%(end_date)s) +pdi.person_id as person_id, +e.uuid as uuid, +e.event as event +from events e JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) pdi on e.distinct_id = pdi.distinct_id +where toDateTime(e.timestamp) >= toDateTime(%(reference_start_date)s) AND toDateTime(e.timestamp) <= toDateTime(%(reference_end_date)s) AND e.team_id = %(team_id)s {target_query} {filters} """ REFERENCE_EVENT_UNIQUE_SQL = """ -SELECT DISTINCT +SELECT DISTINCT min({trunc_func}(e.timestamp)) as event_date, -pdi.person_id as person_id -from events e JOIN (SELECT person_id, distinct_id FROM person_distinct_id WHERE team_id = %(team_id)s) pdi on e.distinct_id = pdi.distinct_id -WHERE e.team_id = %(team_id)s {target_query} {filters} +pdi.person_id as person_id, +argMin(e.uuid, {trunc_func}(e.timestamp)) as min_uuid, +argMin(e.event, {trunc_func}(e.timestamp)) as min_event +from events e JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) pdi on e.distinct_id = pdi.distinct_id +WHERE e.team_id = %(team_id)s {target_query} {filters} GROUP BY person_id HAVING -event_date >= toDateTime(%(start_date)s) AND event_date <= toDateTime(%(end_date)s) +event_date >= toDateTime(%(reference_start_date)s) AND event_date <= toDateTime(%(reference_end_date)s) +""" + + +RETENTION_PEOPLE_SQL = """ +SELECT DISTINCT person_id +FROM events e join ({GET_TEAM_PERSON_DISTINCT_IDS}) pdi on e.distinct_id = pdi.distinct_id +where toDateTime(e.timestamp) >= 
toDateTime(%(start_date)s) AND toDateTime(e.timestamp) <= toDateTime(%(end_date)s) +AND e.team_id = %(team_id)s AND person_id IN ( + SELECT person_id FROM ({reference_event_query}) as persons +) {target_query} {filters} +LIMIT 100 OFFSET %(offset)s +""" + +INITIAL_INTERVAL_SQL = """ +SELECT datediff(%(period)s, {trunc_func}(toDateTime(%(start_date)s)), event_date) event_date, + count(DISTINCT target) FROM ( + {reference_event_sql} +) GROUP BY event_date ORDER BY event_date """ diff --git a/ee/clickhouse/sql/session_recording_events.py b/ee/clickhouse/sql/session_recording_events.py index e8b90bf46f6eb..1974871e3f364 100644 --- a/ee/clickhouse/sql/session_recording_events.py +++ b/ee/clickhouse/sql/session_recording_events.py @@ -1,17 +1,19 @@ from ee.kafka_client.topics import KAFKA_SESSION_RECORDING_EVENTS +from posthog.settings import CLICKHOUSE_CLUSTER, CLICKHOUSE_DATABASE -from .clickhouse import KAFKA_COLUMNS, STORAGE_POLICY, kafka_engine, table_engine, ttl_period +from .clickhouse import KAFKA_COLUMNS, REPLACING_MERGE_TREE, kafka_engine, table_engine, ttl_period SESSION_RECORDING_EVENTS_TABLE = "session_recording_events" SESSION_RECORDING_EVENTS_TABLE_BASE_SQL = """ -CREATE TABLE {table_name} +CREATE TABLE IF NOT EXISTS {table_name} ON CLUSTER {cluster} ( uuid UUID, timestamp DateTime64(6, 'UTC'), team_id Int64, distinct_id VARCHAR, session_id VARCHAR, + window_id VARCHAR, snapshot_data VARCHAR, created_at DateTime64(6, 'UTC') {extra_fields} @@ -27,38 +29,43 @@ """ ).format( table_name=SESSION_RECORDING_EVENTS_TABLE, + cluster=CLICKHOUSE_CLUSTER, extra_fields=KAFKA_COLUMNS, - engine=table_engine(SESSION_RECORDING_EVENTS_TABLE, "_timestamp"), + engine=table_engine(SESSION_RECORDING_EVENTS_TABLE, "_timestamp", REPLACING_MERGE_TREE), ttl_period=ttl_period(), ) KAFKA_SESSION_RECORDING_EVENTS_TABLE_SQL = SESSION_RECORDING_EVENTS_TABLE_BASE_SQL.format( table_name="kafka_" + SESSION_RECORDING_EVENTS_TABLE, + cluster=CLICKHOUSE_CLUSTER, engine=kafka_engine(topic=KAFKA_SESSION_RECORDING_EVENTS), extra_fields="", ) SESSION_RECORDING_EVENTS_TABLE_MV_SQL = """ -CREATE MATERIALIZED VIEW {table_name}_mv -TO {table_name} +CREATE MATERIALIZED VIEW {table_name}_mv ON CLUSTER {cluster} +TO {database}.{table_name} AS SELECT uuid, timestamp, team_id, distinct_id, session_id, +window_id, snapshot_data, created_at, _timestamp, _offset -FROM kafka_{table_name} +FROM {database}.kafka_{table_name} """.format( - table_name=SESSION_RECORDING_EVENTS_TABLE + table_name=SESSION_RECORDING_EVENTS_TABLE, cluster=CLICKHOUSE_CLUSTER, database=CLICKHOUSE_DATABASE, ) INSERT_SESSION_RECORDING_EVENT_SQL = """ -INSERT INTO session_recording_events SELECT %(uuid)s, %(timestamp)s, %(team_id)s, %(distinct_id)s, %(session_id)s, %(snapshot_data)s, %(created_at)s, now(), 0 +INSERT INTO session_recording_events SELECT %(uuid)s, %(timestamp)s, %(team_id)s, %(distinct_id)s, %(session_id)s, %(window_id)s, %(snapshot_data)s, %(created_at)s, now(), 0 """ -DROP_SESSION_RECORDING_EVENTS_TABLE_SQL = "DROP TABLE session_recording_events" +TRUNCATE_SESSION_RECORDING_EVENTS_TABLE_SQL = ( + f"TRUNCATE TABLE IF EXISTS {SESSION_RECORDING_EVENTS_TABLE} ON CLUSTER {CLICKHOUSE_CLUSTER}" +) diff --git a/ee/clickhouse/sql/sessions/list.py b/ee/clickhouse/sql/sessions/list.py index b0f4270ab5620..255c83545e0c4 100644 --- a/ee/clickhouse/sql/sessions/list.py +++ b/ee/clickhouse/sql/sessions/list.py @@ -1,15 +1,26 @@ +SESSIONS_DISTINCT_ID_SQL = """ + SELECT distinct distinct_id + FROM + events + WHERE team_id = %(team_id)s + {date_from} + {date_to} + 
{person_filters} + {action_filters} + ORDER BY timestamp DESC + LIMIT %(distinct_id_limit)s +""" + SESSION_SQL = """ SELECT distinct_id, gid, dateDiff('second', toDateTime(arrayReduce('min', groupArray(timestamp))), toDateTime(arrayReduce('max', groupArray(timestamp)))) AS elapsed, arrayReduce('min', groupArray(timestamp)) as start_time, - groupArray(uuid) uuids, - groupArray(event) events, - groupArray(properties) properties, - groupArray(timestamp) timestamps, - groupArray(elements_chain) elements_chain, - arrayReduce('max', groupArray(timestamp)) as end_time + arrayReduce('max', groupArray(timestamp)) as end_time, + JSONExtractString(arrayElement(groupArray(properties), 1), '$current_url') as start_url, + JSONExtractString(arrayElement(groupArray(properties), -1), '$current_url') as end_url + {filters_select_clause} FROM ( SELECT distinct_id, @@ -19,6 +30,7 @@ properties, elements_chain, arraySum(arraySlice(gids, 1, idx)) AS gid + {matches_action_clauses} FROM ( SELECT groupArray(timestamp) as timestamps, @@ -54,17 +66,7 @@ AND event != '$feature_flag_called' {date_from} {date_to} - {filters} - AND distinct_id IN ( - SELECT distinct distinct_id - FROM - events - WHERE team_id = %(team_id)s - {date_from} - {date_to} - ORDER BY timestamp DESC - LIMIT %(distinct_id_limit)s - ) + AND distinct_id IN %(distinct_ids)s GROUP BY uuid, event, @@ -90,7 +92,24 @@ GROUP BY distinct_id, gid + {filters_having} ORDER BY - start_time DESC + end_time DESC {sessions_limit} """ + +SESSION_EVENTS = """ +SELECT + uuid, + event, + properties, + timestamp, + elements_chain +FROM events +WHERE team_id = %(team_id)s + AND event != '$feature_flag_called' + AND distinct_id = %(distinct_id)s + {date_from} + {date_to} +ORDER BY timestamp +""" diff --git a/ee/clickhouse/sql/sessions/no_events.py b/ee/clickhouse/sql/sessions/no_events.py index e8cc98b5a2ff7..f28c7452ca6c7 100644 --- a/ee/clickhouse/sql/sessions/no_events.py +++ b/ee/clickhouse/sql/sessions/no_events.py @@ -1,27 +1,17 @@ SESSIONS_NO_EVENTS_SQL = """ SELECT - distinct_id, - uuid, - session_uuid, session_duration_seconds, - timestamp, - session_end_ts + timestamp FROM ( SELECT - distinct_id, - uuid, - if(is_new_session, uuid, NULL) AS session_uuid, is_new_session, is_end_session, if(is_end_session AND is_new_session, 0, if(is_new_session AND (NOT is_end_session), dateDiff('second', toDateTime(timestamp), toDateTime(neighbor(timestamp, 1))), NULL)) AS session_duration_seconds, - timestamp, - if(is_end_session AND is_new_session, timestamp, if(is_new_session AND (NOT is_end_session), neighbor(timestamp, 1), NULL)) AS session_end_ts + timestamp FROM ( SELECT - distinct_id, - uuid, timestamp, neighbor(distinct_id, -1) AS start_possible_neighbor, neighbor(timestamp, -1) AS start_possible_prev_ts, @@ -32,18 +22,16 @@ FROM ( SELECT - uuid, timestamp, distinct_id FROM events WHERE team_id = %(team_id)s - AND event != '$feature_flag_called' + {entity_filter} {date_from} {date_to} {filters} GROUP BY - uuid, timestamp, distinct_id ORDER BY diff --git a/ee/clickhouse/sql/stickiness/stickiness.py b/ee/clickhouse/sql/stickiness/stickiness.py index d4b7fd2614ec2..c3025b9526597 100644 --- a/ee/clickhouse/sql/stickiness/stickiness.py +++ b/ee/clickhouse/sql/stickiness/stickiness.py @@ -1,9 +1,12 @@ STICKINESS_SQL = """ - SELECT countDistinct(person_id), day_count FROM ( - SELECT person_distinct_id.person_id, countDistinct(toDate(timestamp)) as day_count + SELECT countDistinct(person_id), num_intervals FROM ( + SELECT person_distinct_id.person_id, 
countDistinct({trunc_func}(toDateTime(timestamp))) as num_intervals FROM events - LEFT JOIN (SELECT person_id, distinct_id FROM person_distinct_id WHERE team_id = %(team_id)s) as person_distinct_id ON person_distinct_id.distinct_id = events.distinct_id - WHERE team_id = {team_id} AND event = '{event}' {filters} {parsed_date_from} {parsed_date_to} + LEFT JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) as person_distinct_id ON person_distinct_id.distinct_id = events.distinct_id + WHERE team_id = %(team_id)s AND event = '{event}' {filters} {parsed_date_from} {parsed_date_to} GROUP BY person_distinct_id.person_id - ) GROUP BY day_count ORDER BY day_count + ) + WHERE num_intervals <= %(num_intervals)s + GROUP BY num_intervals + ORDER BY num_intervals """ diff --git a/ee/clickhouse/sql/stickiness/stickiness_actions.py b/ee/clickhouse/sql/stickiness/stickiness_actions.py index 22bed12e8d617..923203142e834 100644 --- a/ee/clickhouse/sql/stickiness/stickiness_actions.py +++ b/ee/clickhouse/sql/stickiness/stickiness_actions.py @@ -1,9 +1,12 @@ STICKINESS_ACTIONS_SQL = """ - SELECT countDistinct(person_id), day_count FROM ( - SELECT person_distinct_id.person_id, countDistinct(toDate(timestamp)) as day_count + SELECT countDistinct(person_id), num_intervals FROM ( + SELECT person_distinct_id.person_id, countDistinct({trunc_func}(toDateTime(timestamp))) as num_intervals FROM events - LEFT JOIN (SELECT person_id, distinct_id FROM person_distinct_id WHERE team_id = %(team_id)s) as person_distinct_id ON person_distinct_id.distinct_id = events.distinct_id + LEFT JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) as person_distinct_id ON person_distinct_id.distinct_id = events.distinct_id WHERE team_id = %(team_id)s AND {actions_query} {filters} {parsed_date_from} {parsed_date_to} GROUP BY person_distinct_id.person_id - ) GROUP BY day_count ORDER BY day_count + ) + WHERE num_intervals <= %(num_intervals)s + GROUP BY num_intervals + ORDER BY num_intervals """ diff --git a/ee/clickhouse/sql/stickiness/stickiness_people.py b/ee/clickhouse/sql/stickiness/stickiness_people.py index a142254993626..47b728e00853f 100644 --- a/ee/clickhouse/sql/stickiness/stickiness_people.py +++ b/ee/clickhouse/sql/stickiness/stickiness_people.py @@ -1,9 +1,9 @@ STICKINESS_PEOPLE_SQL = """ -SELECT DISTINCT pid FROM ( - SELECT DISTINCT person_distinct_id.person_id as pid, countDistinct(toDate(timestamp)) as day_count +SELECT DISTINCT pdi FROM ( + SELECT DISTINCT person_distinct_id.person_id AS pdi, countDistinct({trunc_func}(toDateTime(timestamp))) AS num_intervals FROM events - LEFT JOIN (SELECT person_id, distinct_id FROM person_distinct_id WHERE team_id = %(team_id)s) as person_distinct_id ON person_distinct_id.distinct_id = events.distinct_id + LEFT JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) AS person_distinct_id ON person_distinct_id.distinct_id = events.distinct_id WHERE team_id = %(team_id)s {entity_filter} {filters} {parsed_date_from} {parsed_date_to} GROUP BY person_distinct_id.person_id -) WHERE day_count = %(stickiness_day)s +) WHERE num_intervals = %(stickiness_day)s """ diff --git a/ee/clickhouse/sql/trends/aggregate.py b/ee/clickhouse/sql/trends/aggregate.py index db31e5661d633..aaf1ced873c87 100644 --- a/ee/clickhouse/sql/trends/aggregate.py +++ b/ee/clickhouse/sql/trends/aggregate.py @@ -1,5 +1,5 @@ AGGREGATE_SQL = """ -SELECT groupArray(day_start), groupArray(count) FROM ( +SELECT groupArray(day_start) as date, groupArray(count) as data FROM ( SELECT SUM(total) AS count, day_start from ({null_sql} UNION ALL {content_sql}) group by day_start 
order by day_start ) """ diff --git a/ee/clickhouse/sql/trends/breakdown.py b/ee/clickhouse/sql/trends/breakdown.py index 8f501803ac213..14969788accfa 100644 --- a/ee/clickhouse/sql/trends/breakdown.py +++ b/ee/clickhouse/sql/trends/breakdown.py @@ -1,82 +1,116 @@ BREAKDOWN_QUERY_SQL = """ -SELECT groupArray(day_start), groupArray(count), breakdown_value FROM ( +SELECT groupArray(day_start) as date, groupArray(count) as data, breakdown_value FROM ( SELECT SUM(total) as count, day_start, breakdown_value FROM ( SELECT * FROM ( - {null_sql} as main + -- Create a table with 1 row for each interval for the requested date range + -- This acts as a method of zero filling, i.e. when there are no data points + -- for a given interval, we'll still have a row for the group by interval with + -- a 0 value. + -- + -- It's essentially a cross product of graph "ticks" and breakdown values. + -- + -- TODO: we're relying on num_intervals, seconds_int_interval etc. being passed + -- in as a parameter. To reduce the coupling between here and the + -- calling code, we could perform calculations for these within the query + -- itself based on date_to/date_from. We could also pass in the intervals + -- explicitly, although we'll be relying on the date handling between python + -- and ClickHouse to be the same. + -- + -- NOTE: there is the ORDER BY ... WITH FILL Expression but I'm not sure how we'd + -- handle the edge cases: + -- + -- https://clickhouse.com/docs/en/sql-reference/statements/select/order-by/#orderby-with-fill + -- + + SELECT + toUInt16(0) AS total, + ticks.day_start as day_start, + breakdown_value + + FROM ( + -- Generates all the intervals/ticks in the date range + -- NOTE: we build this range by including successive intervals back from the + -- upper bound, then including the lower bound in the query also. 
+ + SELECT + {interval}( + toDateTime(%(date_to)s) - number * %(seconds_in_interval)s + ) as day_start + FROM numbers({num_intervals}) + UNION ALL + SELECT {interval}(toDateTime(%(date_from)s)) as day_start + ) as ticks + + -- Zero fill for all values for the specified breakdown CROSS JOIN ( SELECT breakdown_value FROM ( SELECT %(values)s as breakdown_value - ) ARRAY JOIN breakdown_value + ) ARRAY JOIN breakdown_value ) as sec ORDER BY breakdown_value, day_start - UNION ALL - SELECT {aggregate_operation} as total, toDateTime({interval_annotation}(timestamp), 'UTC') as day_start, value as breakdown_value - FROM - events e {event_join} {breakdown_filter} - GROUP BY day_start, breakdown_value + UNION ALL + {inner_sql} ) - ) + ) GROUP BY day_start, breakdown_value ORDER BY breakdown_value, day_start ) GROUP BY breakdown_value """ -BREAKDOWN_DEFAULT_SQL = """ -SELECT groupArray(day_start), groupArray(count) FROM ( - SELECT SUM(total) as count, day_start FROM ( - SELECT * FROM ( - {null_sql} as main - ORDER BY day_start - UNION ALL - SELECT {aggregate_operation} as total, toDateTime({interval_annotation}(timestamp), 'UTC') as day_start - FROM - events e {event_join} {breakdown_filter} - GROUP BY day_start - ) - ) - GROUP BY day_start - ORDER BY day_start -) +BREAKDOWN_INNER_SQL = """ +SELECT + {aggregate_operation} as total, + toDateTime({interval_annotation}(timestamp), 'UTC') as day_start, + {breakdown_value} as breakdown_value +FROM events e +{person_join} +{groups_join} +{breakdown_filter} +GROUP BY day_start, breakdown_value """ -BREAKDOWN_CONDITIONS_SQL = """ -WHERE team_id = %(team_id)s {event_filter} {filters} {parsed_date_from} {parsed_date_to} {actions_query} +BREAKDOWN_ACTIVE_USER_INNER_SQL = """ +SELECT counts as total, timestamp as day_start, breakdown_value +FROM ( + SELECT d.timestamp, COUNT(DISTINCT person_id) counts, breakdown_value FROM ( + SELECT toStartOfDay(timestamp) as timestamp FROM events e WHERE team_id = %(team_id)s {parsed_date_from_prev_range} {parsed_date_to} GROUP BY timestamp + ) d + CROSS JOIN ( + SELECT toStartOfDay(timestamp) as timestamp, person_id, {breakdown_value} as breakdown_value + FROM events e + INNER JOIN ({GET_TEAM_PERSON_DISTINCT_IDS}) as pdi + ON e.distinct_id = pdi.distinct_id + {person_join} + {groups_join} + {conditions} + GROUP BY timestamp, person_id, breakdown_value + ) e + WHERE e.timestamp <= d.timestamp AND e.timestamp > d.timestamp - INTERVAL {prev_interval} + GROUP BY d.timestamp, breakdown_value + ORDER BY d.timestamp +) WHERE 11111 = 11111 {parsed_date_from} {parsed_date_to} """ -BREAKDOWN_PERSON_PROP_JOIN_SQL = """ -INNER JOIN (SELECT person_id, distinct_id FROM person_distinct_id WHERE team_id = %(team_id)s) as pid ON e.distinct_id = pid.distinct_id -INNER JOIN ( - SELECT * FROM ( - SELECT - id, - array_property_keys as key, - array_property_values as value - from ( - SELECT - id, - arrayMap(k -> toString(k.1), JSONExtractKeysAndValuesRaw(properties)) AS array_property_keys, - arrayMap(k -> toString(k.2), JSONExtractKeysAndValuesRaw(properties)) AS array_property_values - FROM ({latest_person_sql}) person WHERE team_id = %(team_id)s - ) - ARRAY JOIN array_property_keys, array_property_values - ) ep - WHERE key = %(key)s -) ep -ON person_id = ep.id WHERE e.team_id = %(team_id)s {event_filter} {filters} {parsed_date_from} {parsed_date_to} -AND breakdown_value in (%(values)s) {actions_query} + +BREAKDOWN_AGGREGATE_QUERY_SQL = """ +SELECT {aggregate_operation} AS total, {breakdown_value} AS breakdown_value +FROM events e +{person_join} 
+{groups_join} +{breakdown_filter} +GROUP BY breakdown_value """ +BREAKDOWN_ACTIVE_USER_CONDITIONS_SQL = """ +WHERE e.team_id = %(team_id)s {event_filter} {filters} {parsed_date_from_prev_range} {parsed_date_to} {actions_query} +""" BREAKDOWN_PROP_JOIN_SQL = """ -INNER JOIN ( - SELECT * - FROM events_properties_view AS ep - WHERE key = %(key)s and team_id = %(team_id)s -) ep -ON uuid = ep.event_id where e.team_id = %(team_id)s {event_filter} {filters} {parsed_date_from} {parsed_date_to} -AND breakdown_value in (%(values)s) {actions_query} +WHERE e.team_id = %(team_id)s {event_filter} {filters} {parsed_date_from} {parsed_date_to} + AND {breakdown_value_expr} in (%(values)s) + {actions_query} """ BREAKDOWN_COHORT_JOIN_SQL = """ diff --git a/ee/clickhouse/sql/trends/lifecycle.py b/ee/clickhouse/sql/trends/lifecycle.py new file mode 100644 index 0000000000000..0f7d817317f46 --- /dev/null +++ b/ee/clickhouse/sql/trends/lifecycle.py @@ -0,0 +1,156 @@ +LIFECYCLE_SQL = """ +SELECT groupArray(day_start) as date, groupArray(counts) as data, status FROM ( + SELECT if(status = 'dormant', toInt64(SUM(counts)) * toInt16(-1), toInt64(SUM(counts))) as counts, day_start, status + FROM ( + SELECT ticks.day_start as day_start, toUInt16(0) AS counts, status + + FROM ( + -- Generates all the intervals/ticks in the date range + -- NOTE: we build this range by including successive intervals back from the + -- upper bound, then including the lower bound in the query also. + + SELECT + {trunc_func}( + toDateTime(%(date_to)s) - number * %(seconds_in_interval)s + ) as day_start + FROM numbers(%(num_intervals)s) + UNION ALL + SELECT {trunc_func}(toDateTime(%(date_from)s)) as day_start + ) as ticks + + CROSS JOIN ( + SELECT status + FROM ( + SELECT ['new', 'returning', 'resurrecting', 'dormant'] as status + ) ARRAY JOIN status + ) as sec + ORDER BY status, day_start + + UNION ALL + + SELECT subsequent_day, count(DISTINCT person_id) counts, status FROM ( + SELECT *, if(base_day = toDateTime('0000-00-00 00:00:00'), 'dormant', if(subsequent_day = base_day + INTERVAL {interval}, 'returning', if(subsequent_day > earliest + INTERVAL {interval}, 'resurrecting', 'new'))) as status FROM ( + SELECT person_id, base_day, min(subsequent_day) as subsequent_day FROM ( + SELECT person_id, day as base_day, events.subsequent_day as subsequent_day FROM ( + SELECT DISTINCT person_id, {trunc_func}(events.timestamp) day FROM events + JOIN + ({GET_TEAM_PERSON_DISTINCT_IDS}) pdi on events.distinct_id = pdi.distinct_id + WHERE team_id = %(team_id)s AND {event_query} {filters} + GROUP BY person_id, day HAVING day <= toDateTime(%(date_to)s) AND day >= toDateTime(%(prev_date_from)s) + ) base + JOIN ( + SELECT DISTINCT person_id, {trunc_func}(events.timestamp) subsequent_day FROM events + JOIN + ({GET_TEAM_PERSON_DISTINCT_IDS}) pdi on events.distinct_id = pdi.distinct_id + WHERE team_id = %(team_id)s AND {event_query} {filters} + GROUP BY person_id, subsequent_day HAVING subsequent_day <= toDateTime(%(date_to)s) AND subsequent_day >= toDateTime(%(prev_date_from)s) + ) events ON base.person_id = events.person_id + WHERE subsequent_day > base_day + ) + GROUP BY person_id, base_day + UNION ALL + SELECT person_id, min(day) as base_day, min(day) as subsequent_day FROM ( + SELECT DISTINCT person_id, {trunc_func}(events.timestamp) day FROM events + JOIN + ({GET_TEAM_PERSON_DISTINCT_IDS}) pdi on events.distinct_id = pdi.distinct_id + WHERE team_id = %(team_id)s AND {event_query} {filters} + GROUP BY person_id, day HAVING day <= toDateTime(%(date_to)s) 
AND day >= toDateTime(%(prev_date_from)s) + ) base + GROUP BY person_id + UNION ALL + SELECT person_id, base_day, subsequent_day FROM ( + SELECT person_id, total as base_day, day_start as subsequent_day FROM ( + SELECT DISTINCT person_id, groupArray({trunc_func}(events.timestamp)) day FROM events + JOIN + ({GET_TEAM_PERSON_DISTINCT_IDS}) pdi on events.distinct_id = pdi.distinct_id + WHERE team_id = %(team_id)s AND {event_query} {filters} + AND toDateTime(events.timestamp) <= toDateTime(%(date_to)s) AND {trunc_func}(events.timestamp) >= toDateTime(%(date_from)s) + GROUP BY person_id + ) as e + CROSS JOIN ( + SELECT toDateTime('0000-00-00 00:00:00') AS total, {trunc_func}(toDateTime(%(date_to)s) - number * %(seconds_in_interval)s) as day_start from numbers(%(num_intervals)s) + ) as b WHERE has(day, subsequent_day) = 0 + ORDER BY person_id, subsequent_day ASC + ) WHERE + ((empty(toString(neighbor(person_id, -1))) OR neighbor(person_id, -1) != person_id) AND subsequent_day != {trunc_func}(toDateTime(%(date_from)s) + INTERVAL {interval} - INTERVAL {sub_interval})) + OR + ( (neighbor(person_id, -1) = person_id) AND neighbor(subsequent_day, -1) < subsequent_day - INTERVAL {interval}) + ) e + JOIN ( + SELECT DISTINCT person_id, {trunc_func}(min(events.timestamp)) earliest FROM events + JOIN + ({GET_TEAM_PERSON_DISTINCT_IDS}) pdi on events.distinct_id = pdi.distinct_id + WHERE team_id = %(team_id)s AND {event_query} {filters} + GROUP BY person_id + ) earliest ON e.person_id = earliest.person_id + ) + WHERE subsequent_day <= toDateTime(%(date_to)s) AND subsequent_day >= toDateTime(%(date_from)s) + GROUP BY subsequent_day, status + ) + GROUP BY day_start, status + ORDER BY day_start ASC +) +GROUP BY status +""" + +LIFECYCLE_PEOPLE_SQL = """ +SELECT person_id FROM ( + SELECT *, if(base_day = toDateTime('0000-00-00 00:00:00'), 'dormant', if(subsequent_day = base_day + INTERVAL {interval}, 'returning', if(subsequent_day > earliest + INTERVAL {interval}, 'resurrecting', 'new'))) as status FROM ( + SELECT person_id, base_day, min(subsequent_day) as subsequent_day FROM ( + SELECT person_id, day as base_day, events.subsequent_day as subsequent_day FROM ( + SELECT DISTINCT person_id, {trunc_func}(events.timestamp) day FROM events + JOIN + ({GET_TEAM_PERSON_DISTINCT_IDS}) pdi on events.distinct_id = pdi.distinct_id + WHERE team_id = %(team_id)s AND {event_query} {filters} + GROUP BY person_id, day HAVING day <= toDateTime(%(date_to)s) AND day >= toDateTime(%(prev_date_from)s) + ) base + JOIN ( + SELECT DISTINCT person_id, {trunc_func}(events.timestamp) subsequent_day FROM events + JOIN + ({GET_TEAM_PERSON_DISTINCT_IDS}) pdi on events.distinct_id = pdi.distinct_id + WHERE team_id = %(team_id)s AND {event_query} {filters} + GROUP BY person_id, subsequent_day HAVING subsequent_day <= toDateTime(%(date_to)s) AND subsequent_day >= toDateTime(%(prev_date_from)s) + ) events ON base.person_id = events.person_id + WHERE subsequent_day > base_day + ) + GROUP BY person_id, base_day + UNION ALL + SELECT person_id, min(day) as base_day, min(day) as subsequent_day FROM ( + SELECT DISTINCT person_id, {trunc_func}(events.timestamp) day FROM events + JOIN + ({GET_TEAM_PERSON_DISTINCT_IDS}) pdi on events.distinct_id = pdi.distinct_id + WHERE team_id = %(team_id)s AND {event_query} {filters} + GROUP BY person_id, day HAVING day <= toDateTime(%(date_to)s) AND day >= toDateTime(%(prev_date_from)s) + ) base + GROUP BY person_id + UNION ALL + SELECT person_id, base_day, subsequent_day FROM ( + SELECT person_id, dummy as base_day, 
day_start as subsequent_day FROM ( + SELECT DISTINCT person_id, groupArray({trunc_func}(events.timestamp)) day FROM events + JOIN + ({GET_TEAM_PERSON_DISTINCT_IDS}) pdi on events.distinct_id = pdi.distinct_id + WHERE team_id = %(team_id)s AND {event_query} {filters} + AND toDateTime(events.timestamp) <= toDateTime(%(date_to)s) AND {trunc_func}(events.timestamp) >= toDateTime(%(date_from)s) + GROUP BY person_id + ) as e + CROSS JOIN ( + SELECT toDateTime('0000-00-00 00:00:00') AS dummy, {trunc_func}(toDateTime(%(date_to)s) - number * %(seconds_in_interval)s) as day_start from numbers(%(num_intervals)s) + ) as b WHERE has(day, subsequent_day) = 0 + ORDER BY person_id, subsequent_day ASC + ) WHERE + ((empty(toString(neighbor(person_id, -1))) OR neighbor(person_id, -1) != person_id) AND subsequent_day != {trunc_func}(toDateTime(%(date_from)s) + INTERVAL {interval} - INTERVAL {sub_interval})) + OR + ( (neighbor(person_id, -1) = person_id) AND neighbor(subsequent_day, -1) < subsequent_day - INTERVAL {interval}) + ) e + JOIN ( + SELECT DISTINCT person_id, {trunc_func}(min(events.timestamp)) earliest FROM events + JOIN + ({GET_TEAM_PERSON_DISTINCT_IDS}) pdi on events.distinct_id = pdi.distinct_id + WHERE team_id = %(team_id)s AND {event_query} {filters} + GROUP BY person_id + ) earliest ON e.person_id = earliest.person_id +) e +WHERE status = %(status)s +AND {trunc_func}(toDateTime(%(target_date)s)) = subsequent_day +LIMIT %(limit)s OFFSET %(offset)s +""" diff --git a/ee/clickhouse/sql/trends/top_elements.py b/ee/clickhouse/sql/trends/top_elements.py index efc39ec101ebc..beff7b7c2d4ca 100644 --- a/ee/clickhouse/sql/trends/top_elements.py +++ b/ee/clickhouse/sql/trends/top_elements.py @@ -1,15 +1,15 @@ TOP_ELEMENTS_ARRAY_OF_KEY_SQL = """ SELECT groupArray(value) FROM ( - SELECT value, count(*) as count - FROM - events e INNER JOIN - ( - SELECT * - FROM events_properties_view AS ep - WHERE key = %(key)s AND team_id = %(team_id)s - ) ep ON e.uuid = ep.event_id WHERE team_id = %(team_id)s {parsed_date_from} {parsed_date_to} + SELECT + {value_expression} AS value, + {aggregate_operation} as count + FROM events e + {person_join_clauses} + {groups_join_clauses} + WHERE + team_id = %(team_id)s {entity_query} {parsed_date_from} {parsed_date_to} {prop_filters} GROUP BY value ORDER BY count DESC - LIMIT %(limit)s + LIMIT %(limit)s OFFSET %(offset)s ) """ diff --git a/ee/clickhouse/sql/trends/top_person_props.py b/ee/clickhouse/sql/trends/top_person_props.py deleted file mode 100644 index 182f3d2d48a5e..0000000000000 --- a/ee/clickhouse/sql/trends/top_person_props.py +++ /dev/null @@ -1,29 +0,0 @@ -TOP_PERSON_PROPS_ARRAY_OF_KEY_SQL = """ -SELECT groupArray(value) FROM ( - SELECT value, count(*) as count - FROM - events e - INNER JOIN (SELECT person_id, distinct_id FROM person_distinct_id WHERE team_id = %(team_id)s) as pid ON e.distinct_id = pid.distinct_id - INNER JOIN - ( - SELECT * FROM ( - SELECT - id, - array_property_keys as key, - array_property_values as value - from ( - SELECT - id, - arrayMap(k -> toString(k.1), JSONExtractKeysAndValuesRaw(properties)) AS array_property_keys, - arrayMap(k -> toString(k.2), JSONExtractKeysAndValuesRaw(properties)) AS array_property_values - FROM ({latest_person_sql}) person WHERE team_id = %(team_id)s - ) - ARRAY JOIN array_property_keys, array_property_values - ) ep - WHERE key = %(key)s - ) ep ON person_id = ep.id WHERE e.team_id = %(team_id)s {parsed_date_from} {parsed_date_to} - GROUP BY value - ORDER BY count DESC - LIMIT %(limit)s -) -""" diff --git 
a/ee/clickhouse/sql/trends/volume.py b/ee/clickhouse/sql/trends/volume.py index 9a99299d12779..239cb79cb9e3f 100644 --- a/ee/clickhouse/sql/trends/volume.py +++ b/ee/clickhouse/sql/trends/volume.py @@ -1,7 +1,35 @@ VOLUME_SQL = """ -SELECT {aggregate_operation} as total, toDateTime({interval}({timestamp}), 'UTC') as day_start from events {event_join} where team_id = {team_id} and event = %(event)s {filters} {parsed_date_from} {parsed_date_to} GROUP BY {interval}({timestamp}) +SELECT {aggregate_operation} as data, toDateTime({interval}(timestamp), 'UTC') as date FROM ({event_query}) GROUP BY {interval}(timestamp) """ -VOLUME_ACTIONS_SQL = """ -SELECT {aggregate_operation} as total, toDateTime({interval}({timestamp}), 'UTC') as day_start from events {event_join} where team_id = {team_id} and {actions_query} {filters} {parsed_date_from} {parsed_date_to} GROUP BY {interval}({timestamp}) +VOLUME_TOTAL_AGGREGATE_SQL = """ +SELECT {aggregate_operation} as data FROM ({event_query}) events +""" + +ACTIVE_USER_SQL = """ +SELECT counts as total, timestamp as day_start FROM ( + SELECT d.timestamp, COUNT(DISTINCT person_id) counts FROM ( + SELECT toStartOfDay(timestamp) as timestamp FROM events WHERE team_id = %(team_id)s {parsed_date_from_prev_range} {parsed_date_to} GROUP BY timestamp + ) d + CROSS JOIN ( + SELECT toStartOfDay(timestamp) as timestamp, person_id FROM ({event_query}) events WHERE 1 = 1 {parsed_date_from_prev_range} {parsed_date_to} GROUP BY timestamp, person_id + ) e WHERE e.timestamp <= d.timestamp AND e.timestamp > d.timestamp - INTERVAL {prev_interval} + GROUP BY d.timestamp + ORDER BY d.timestamp +) WHERE 1 = 1 {parsed_date_from} {parsed_date_to} +""" + +PERSONS_ACTIVE_USER_SQL = """ +SELECT DISTINCT person_id FROM ( + SELECT d.timestamp, person_id FROM ( + SELECT toStartOfDay(timestamp) as timestamp FROM events WHERE team_id = %(team_id)s {parsed_date_from_prev_range} {parsed_date_to} GROUP BY timestamp + ) d + CROSS JOIN ( + SELECT toStartOfDay(timestamp) as timestamp, person_id FROM events INNER JOIN ( + {GET_TEAM_PERSON_DISTINCT_IDS} + ) AS pdi + ON events.distinct_id = pdi.distinct_id + WHERE team_id = %(team_id)s {entity_query} {filters} {parsed_date_from_prev_range} {parsed_date_to} GROUP BY timestamp, person_id + ) e WHERE e.timestamp <= d.timestamp AND e.timestamp > d.timestamp - INTERVAL {prev_interval} +) WHERE 1 = 1 {parsed_date_from} {parsed_date_to} """ diff --git a/ee/clickhouse/system_status.py b/ee/clickhouse/system_status.py new file mode 100644 index 0000000000000..623ebd844fb91 --- /dev/null +++ b/ee/clickhouse/system_status.py @@ -0,0 +1,254 @@ +import glob +import subprocess +import tempfile +import uuid +from os.path import abspath, basename, dirname, join +from typing import Dict, Generator, List, Tuple + +import sqlparse +from clickhouse_driver import Client +from dateutil.relativedelta import relativedelta +from django.utils import timezone +from sentry_sdk.api import capture_exception + +from ee.clickhouse.client import make_ch_pool, sync_execute +from ee.clickhouse.models.event import get_event_count, get_event_count_for_last_month, get_event_count_month_to_date +from posthog.settings import CLICKHOUSE_PASSWORD, CLICKHOUSE_STABLE_HOST, CLICKHOUSE_USER + +SLOW_THRESHOLD_MS = 10000 +SLOW_AFTER = relativedelta(hours=6) + +CLICKHOUSE_FLAMEGRAPH_EXECUTABLE = abspath(join(dirname(__file__), "bin", "clickhouse-flamegraph")) +FLAMEGRAPH_PL = abspath(join(dirname(__file__), "bin", "flamegraph.pl")) + +SystemStatusRow = Dict + + +def system_status() -> 
Generator[SystemStatusRow, None, None]: + alive = is_alive() + yield {"key": "clickhouse_alive", "metric": "Clickhouse database alive", "value": alive} + + if not alive: + return + + yield {"key": "clickhouse_event_count", "metric": "Events in ClickHouse", "value": get_event_count()} + yield { + "key": "clickhouse_event_count_last_month", + "metric": "Events recorded last month", + "value": get_event_count_for_last_month(), + } + yield { + "key": "clickhouse_event_count_month_to_date", + "metric": "Events recorded month to date", + "value": get_event_count_month_to_date(), + } + + disk_status = sync_execute( + "SELECT formatReadableSize(total_space), formatReadableSize(free_space) FROM system.disks" + ) + + for index, (total_space, free_space) in enumerate(disk_status): + metric = "Clickhouse disk" if len(disk_status) == 1 else f"Clickhouse disk {index}" + yield {"key": f"clickhouse_disk_{index}_free_space", "metric": f"{metric} free space", "value": free_space} + yield {"key": f"clickhouse_disk_{index}_total_space", "metric": f"{metric} total space", "value": total_space} + + table_sizes = sync_execute( + """ + SELECT + table, + formatReadableSize(sum(bytes)) AS size, + sum(rows) AS rows + FROM system.parts + WHERE active + GROUP BY table + ORDER BY rows DESC + """ + ) + + yield { + "key": "clickhouse_table_sizes", + "metric": "Clickhouse table sizes", + "value": "", + "subrows": {"columns": ["Table", "Size", "Rows"], "rows": table_sizes}, + } + + system_metrics = sync_execute("SELECT * FROM system.asynchronous_metrics") + system_metrics += sync_execute("SELECT * FROM system.metrics") + + yield { + "key": "clickhouse_system_metrics", + "metric": "Clickhouse system metrics", + "value": "", + "subrows": {"columns": ["Metric", "Value", "Description"], "rows": list(sorted(system_metrics))}, + } + + +def is_alive() -> bool: + try: + sync_execute("SELECT 1") + return True + except: + return False + + +def get_clickhouse_running_queries() -> List[Dict]: + return query_with_columns( + "SELECT elapsed as duration, query, * FROM system.processes ORDER BY duration DESC", + columns_to_remove=["address", "initial_address", "elapsed"], + ) + + +def get_clickhouse_slow_log() -> List[Dict]: + return query_with_columns( + f""" + SELECT query_duration_ms as duration, query, * + FROM system.query_log + WHERE query_duration_ms > {SLOW_THRESHOLD_MS} + AND event_time > %(after)s + AND query NOT LIKE '%%system.query_log%%' + AND query NOT LIKE '%%analyze_query:%%' + ORDER BY duration DESC + LIMIT 200 + """, + {"after": timezone.now() - SLOW_AFTER}, + columns_to_remove=[ + "address", + "initial_address", + "query_duration_ms", + "event_time", + "event_date", + "query_start_time_microseconds", + "thread_ids", + "ProfileEvents.Names", + "ProfileEvents.Values", + "Settings.Names", + "Settings.Values", + ], + ) + + +def query_with_columns(query, args=None, columns_to_remove=[]) -> List[Dict]: + metrics, types = sync_execute(query, args, with_column_types=True) + type_names = [key for key, _type in types] + + rows = [] + for row in metrics: + result = {} + for type_name, value in zip(type_names, row): + if isinstance(value, list): + value = ", ".join(map(str, value)) + if type_name not in columns_to_remove: + result[type_name] = value + + rows.append(result) + + return rows + + +def analyze_query(query: str): + random_id = str(uuid.uuid4()) + + # :TRICKY: Ensure all queries run on the same host. 
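+    # The profiled query, the system.query_log lookup in get_query_timing_info()
+    # and the flamegraph tool (which connects to CLICKHOUSE_STABLE_HOST) all need
+    # to read the same node's query log, hence the dedicated pool below.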
+ ch_pool = make_ch_pool(host=CLICKHOUSE_STABLE_HOST) + + with ch_pool.get_client() as conn: + conn.execute( + f""" + -- analyze_query:{random_id} + {query} + """, + settings={ + "allow_introspection_functions": 1, + "query_profiler_real_time_period_ns": 40000000, + "query_profiler_cpu_time_period_ns": 40000000, + "memory_profiler_step": 1048576, + "max_untracked_memory": 1048576, + "memory_profiler_sample_probability": 0.01, + "use_uncompressed_cache": 0, + "readonly": 1, + "allow_ddl": 0, + }, + ) + + query_id, timing_info = get_query_timing_info(random_id, conn) + + return { + "query": sqlparse.format(query, reindent_aligned=True), + "timing": timing_info, + "flamegraphs": get_flamegraphs(query_id), + } + + +def get_query_timing_info(random_id: str, conn: Client) -> Tuple[str, Dict]: + conn.execute("SYSTEM FLUSH LOGS") + results = conn.execute( + """ + SELECT + query_id, + event_time, + query_duration_ms, + read_rows, + formatReadableSize(read_bytes) as read_size, + result_rows, + formatReadableSize(result_bytes) as result_size, + formatReadableSize(memory_usage) as memory_usage + FROM system.query_log + WHERE query NOT LIKE '%%query_log%%' + AND match(query, %(expr)s) + AND type = 'QueryFinish' + LIMIT 1 + """, + {"expr": f"analyze_query:{random_id}"}, + ) + + return ( + results[0][0], + dict( + zip( + [ + "query_id", + "event_time", + "query_duration_ms", + "read_rows", + "read_size", + "result_rows", + "result_size", + "memory_usage", + ], + results[0], + ) + ), + ) + + +def get_flamegraphs(query_id: str) -> Dict: + try: + with tempfile.TemporaryDirectory() as tmpdirname: + subprocess.run( + [ + CLICKHOUSE_FLAMEGRAPH_EXECUTABLE, + "--query-id", + query_id, + "--clickhouse-dsn", + f"http://{CLICKHOUSE_USER}:{CLICKHOUSE_PASSWORD}@{CLICKHOUSE_STABLE_HOST}:8123/", + "--console", + "--flamegraph-script", + FLAMEGRAPH_PL, + "--date-from", + "2021-01-01", + "--width", + "1900", + ], + cwd=tmpdirname, + check=True, + ) + + flamegraphs = {} + for file_path in glob.glob(join(tmpdirname, "*/*/global*.svg")): + with open(file_path) as file: + flamegraphs[basename(file_path)] = file.read() + + return flamegraphs + except Exception as err: + capture_exception(err) + return {} diff --git a/ee/clickhouse/test/test_calculate_event_property_usage.py b/ee/clickhouse/test/test_calculate_event_property_usage.py index dfcc69564c035..ccfbb85a20975 100644 --- a/ee/clickhouse/test/test_calculate_event_property_usage.py +++ b/ee/clickhouse/test/test_calculate_event_property_usage.py @@ -3,7 +3,7 @@ from ee.clickhouse.models.event import create_event from ee.clickhouse.util import ClickhouseTestMixin from posthog.models.event import Event -from posthog.tasks.test.test_calculate_event_property_usage import test_calculate_event_property_usage +from posthog.tasks.test.test_calculate_event_property_usage import calculate_event_property_usage_test_factory def _create_event(**kwargs) -> Event: @@ -14,6 +14,6 @@ def _create_event(**kwargs) -> Event: class CalculateEventPropertyUsage( - ClickhouseTestMixin, test_calculate_event_property_usage(_create_event), # type: ignore + ClickhouseTestMixin, calculate_event_property_usage_test_factory(_create_event), # type: ignore ): pass diff --git a/ee/clickhouse/test/test_client.py b/ee/clickhouse/test/test_client.py index e259bc9b08d08..fdf9a53519eb5 100644 --- a/ee/clickhouse/test/test_client.py +++ b/ee/clickhouse/test/test_client.py @@ -4,10 +4,12 @@ from django.test import TestCase from freezegun import freeze_time -from ee.clickhouse.client import CACHE_TTL, 
_deserialize, _key_hash, cache_sync_execute +from ee.clickhouse import client +from ee.clickhouse.client import CACHE_TTL, _deserialize, _key_hash, cache_sync_execute, sync_execute +from ee.clickhouse.util import ClickhouseTestMixin -class ClickhouseClientTestCase(TestCase): +class ClickhouseClientTestCase(TestCase, ClickhouseTestMixin): def setUp(self): self.redis_client = fakeredis.FakeStrictRedis() @@ -43,3 +45,33 @@ def test_cache_eviction(self): with freeze_time(start + datetime.timedelta(seconds=CACHE_TTL + 10)): exists = self.redis_client.exists(_key_hash(query, args=args)) self.assertFalse(exists) + + def test_client_strips_comments_from_request(self): + """ + To ensure we can easily copy queries from `system.query_log` in e.g. + Metabase, we strip comments from the query we send. Metabase doesn't + display multilined output. + + See https://github.com/metabase/metabase/issues/14253 + + Note I'm not really testing much complexity, I trust that those will + come out as failures in other tests. + """ + # First add in the request information that should be added to the sql. + # We check this to make sure it is not removed by the comment stripping + with self.capture_select_queries() as sqls: + client._request_information = {"kind": "request", "id": "1"} + sync_execute( + query=""" + -- this request returns 1 + SELECT 1 + """ + ) + self.assertEqual(len(sqls), 1) + first_query = sqls[0] + self.assertIn(f"SELECT 1", first_query) + self.assertNotIn("this request returns", first_query) + + # Make sure it still includes the "annotation" comment that includes + # request routing information for debugging purposes + self.assertIn("/* request:1 */", first_query) diff --git a/ee/clickhouse/test/test_error.py b/ee/clickhouse/test/test_error.py new file mode 100644 index 0000000000000..dd330f36cfff7 --- /dev/null +++ b/ee/clickhouse/test/test_error.py @@ -0,0 +1,26 @@ +import pytest +from clickhouse_driver.errors import ServerException + +from ee.clickhouse.errors import wrap_query_error +from posthog.exceptions import EstimatedQueryExecutionTimeTooLong + + +@pytest.mark.parametrize( + "error,expected_type,expected_message,expected_code", + [ + (AttributeError("Foobar"), "AttributeError", "Foobar", None), + ( + ServerException("Estimated query execution time (34.5 seconds) is too long. 
Aborting query"), + "EstimatedQueryExecutionTimeTooLong", + "Estimated query execution time (34.5 seconds) is too long.", + None, + ), + (ServerException("Syntax error", code=62), "CHQueryErrorSyntaxError", "Code: 62.\nSyntax error", 62), + (ServerException("Syntax error", code=9999), "CHQueryErrorUnknown", "Code: 9999.\nSyntax error", 9999), + ], +) +def test_wrap_query_error(error, expected_type, expected_message, expected_code): + new_error = wrap_query_error(error) + assert type(new_error).__name__ == expected_type + assert str(new_error) == expected_message + assert getattr(new_error, "code", None) == expected_code diff --git a/ee/clickhouse/test/test_middleware.py b/ee/clickhouse/test/test_middleware.py new file mode 100644 index 0000000000000..d3c8c8709e04c --- /dev/null +++ b/ee/clickhouse/test/test_middleware.py @@ -0,0 +1,24 @@ +import json + +from ee.api.test.base import APILicensedTest +from posthog.models import User + + +class TestQueryMiddleware(APILicensedTest): + def test_query(self): + self.user.is_staff = True + self.user.save() + response = self.client.get( + f'/api/projects/{self.team.id}/insights/trend/?events={json.dumps([{"id": "$pageview"}])}' + ) + self.assertEqual(response.status_code, 200) + response = self.client.get("/api/debug_ch_queries/").json() + self.assertIn("SELECT", response[0]["query"]) # type: ignore + + #  Test saving queries if we're impersonating a user + user2 = User.objects.create_and_join(organization=self.organization, email="test", password="bla") + self.client.post("/admin/login/user/{}/".format(user2.pk)) + self.client.get(f'/api/projects/{self.team.id}/insights/trend/?events={json.dumps([{"id": "$pageleave"}])}') + + response = self.client.get("/api/debug_ch_queries/").json() + self.assertIn("SELECT", response[0]["query"]) # type: ignore diff --git a/ee/clickhouse/test/test_process_event_ee.py b/ee/clickhouse/test/test_process_event_ee.py deleted file mode 100644 index 64f389170b9cc..0000000000000 --- a/ee/clickhouse/test/test_process_event_ee.py +++ /dev/null @@ -1,60 +0,0 @@ -import json -from typing import Any, Dict, List, Optional, Union -from uuid import UUID - -from dateutil import parser - -from ee.clickhouse.client import sync_execute -from ee.clickhouse.models.element import chain_to_elements -from ee.clickhouse.process_event import process_event_ee -from ee.clickhouse.sql.session_recording_events import SESSION_RECORDING_EVENTS_TABLE -from ee.clickhouse.util import ClickhouseTestMixin -from posthog.models.element import Element -from posthog.models.event import Event -from posthog.models.session_recording_event import SessionRecordingEvent -from posthog.tasks.test.test_process_event import test_process_event_factory - - -def get_session_recording_events(): - return [ - SessionRecordingEvent( - id=event[0], session_id=event[1], distinct_id=event[2], snapshot_data=json.loads(event[3]) - ) - for event in sync_execute( - "select uuid, session_id, distinct_id, snapshot_data from {}".format(SESSION_RECORDING_EVENTS_TABLE) - ) - ] - - -def _get_events(): - return [ - Event(id=ev[0], properties=json.loads(ev[1]), distinct_id=ev[2], event=ev[3], timestamp=ev[4]) - for ev in sync_execute("select uuid, properties, distinct_id, event, timestamp from events") - ] - - -def get_elements(event_id: Union[int, UUID]) -> List[Element]: - return chain_to_elements( - sync_execute("select elements_chain from events where uuid = %(id)s", {"id": event_id})[0][0] - ) - - -def _process_event_ee( - distinct_id: str, ip: str, site_url: str, data: dict, 
team_id: int, now: str, sent_at: Optional[str], -) -> None: - return process_event_ee( - distinct_id=distinct_id, - ip=ip, - site_url=site_url, - data=data, - team_id=team_id, - now=parser.isoparse(now), - sent_at=parser.isoparse(sent_at) if sent_at else None, - ) - - -class ClickhouseProcessEvent( - ClickhouseTestMixin, - test_process_event_factory(_process_event_ee, _get_events, get_session_recording_events, get_elements), # type: ignore -): - pass diff --git a/ee/clickhouse/test/test_system_status.py b/ee/clickhouse/test/test_system_status.py new file mode 100644 index 0000000000000..3f776c54ed454 --- /dev/null +++ b/ee/clickhouse/test/test_system_status.py @@ -0,0 +1,17 @@ +from ee.clickhouse.system_status import system_status + + +def test_system_status(db): + results = list(system_status()) + assert [row["key"] for row in results] == [ + "clickhouse_alive", + "clickhouse_event_count", + "clickhouse_event_count_last_month", + "clickhouse_event_count_month_to_date", + "clickhouse_disk_0_free_space", + "clickhouse_disk_0_total_space", + "clickhouse_table_sizes", + "clickhouse_system_metrics", + ] + assert len(results[-2]["subrows"]["rows"]) > 0 + assert len(results[-1]["subrows"]["rows"]) > 0 diff --git a/ee/clickhouse/timer.py b/ee/clickhouse/timer.py new file mode 100644 index 0000000000000..8d64033c348e2 --- /dev/null +++ b/ee/clickhouse/timer.py @@ -0,0 +1,117 @@ +import logging +import uuid +from collections import OrderedDict +from functools import partial +from threading import Condition, Thread +from time import perf_counter +from typing import Callable, Dict, Optional, Tuple + +from django.conf import settings + +logger = logging.getLogger(__name__) + + +class TimerTask: + id: str + done: bool + + def __init__(self, callback: Callable, *args, **kwargs): + self.callback = partial(callback, *args, **kwargs) + self.id = str(uuid.uuid4()) + self.done = False + + def run(self): + self.done = True + try: + self.callback() + except Exception as err: + logger.warn("TimerTask failed, ignoring error", err) + + +class SingleThreadedTimer(Thread): + def __init__(self, timeout_ms: int, *args, **kwargs): + super().__init__(*args, **kwargs) + self.setDaemon(True) + + self.timeout_ms = timeout_ms + self.started = False + self.lock = Condition() + self.tasks: OrderedDict = OrderedDict() + + def schedule(self, callback: Callable, *args, **kwargs) -> TimerTask: + """ + Schedules a task to be called in `timeout_ms`. Returns a TimerTask instance, + which can be cancelled via `.cancel` + + First call to this starts a background daemon thread. 
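+
+        Illustrative usage (the thread name and callback are placeholders):
+
+            timer = get_timer_thread("example", timeout_ms=500)
+            task = timer.schedule(callback, "arg", key="value")
+            timer.cancel(task)  # safe even if the task has already run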
+ """ + self.start() + + with self.lock: + task = TimerTask(callback, *args, **kwargs) + self.tasks[task.id] = (task, perf_counter()) + self.lock.notify() + + return task + + def cancel(self, task: TimerTask) -> None: + with self.lock: + try: + del self.tasks[task.id] + except: + pass + self.lock.notify() + + # :TRICKY: We override start() to make it easy to start the thread when scheduling the first task + def start(self): + if not self.started: + self.started = True + super().start() + + def run(self): + while True: + job = None + with self.lock: + sleep = self._sleep_time_until_next_task() + if len(self.tasks) == 0: + # Wait until a task is scheduled + self.lock.wait() + elif sleep > 0: + self.lock.wait(sleep) + else: + _, (job, _) = self.tasks.popitem(last=False) + + if job is not None: + job.run() + + def _next_task(self) -> Optional[Tuple[TimerTask, float]]: + for _, task_and_time in self.tasks.items(): + return task_and_time + return None + + def _sleep_time_until_next_task(self) -> float: + "Return time until the next task should be executed, if any task is scheduled" + next_task = self._next_task() + if next_task is None: + return 0 + else: + _, start_time = next_task + return start_time + self.timeout_ms / 1000.0 - perf_counter() + + +class TestSingleThreadedTimer(SingleThreadedTimer): + def run(self): + pass + + +_threads: Dict[str, SingleThreadedTimer] = {} + + +def get_timer_thread(name: str, timeout_ms: int) -> SingleThreadedTimer: + if settings.TEST: + return TestSingleThreadedTimer(timeout_ms=timeout_ms) + + if name not in _threads: + _threads[name] = SingleThreadedTimer(timeout_ms=timeout_ms) + + return _threads[name] diff --git a/ee/clickhouse/util.py b/ee/clickhouse/util.py index ba62cb3929992..159971b805119 100644 --- a/ee/clickhouse/util.py +++ b/ee/clickhouse/util.py @@ -1,70 +1,27 @@ +import re from contextlib import contextmanager +from functools import wraps +from typing import Any +from unittest.mock import patch -from clickhouse_driver.errors import ServerException +import pytest +import sqlparse from django.db import DEFAULT_DB_ALIAS -from ee.clickhouse.client import sync_execute -from ee.clickhouse.sql.events import ( - DROP_EVENTS_TABLE_SQL, - DROP_EVENTS_WITH_ARRAY_PROPS_TABLE_SQL, - DROP_MAT_EVENTS_PROP_TABLE_SQL, - DROP_MAT_EVENTS_WITH_ARRAY_PROPS_TABLE_SQL, - EVENTS_TABLE_SQL, - EVENTS_WITH_PROPS_TABLE_SQL, - MAT_EVENT_PROP_TABLE_SQL, - MAT_EVENTS_WITH_PROPS_TABLE_SQL, -) -from ee.clickhouse.sql.person import ( - DROP_PERSON_DISTINCT_ID_TABLE_SQL, - DROP_PERSON_TABLE_SQL, - PERSONS_DISTINCT_ID_TABLE_SQL, - PERSONS_TABLE_SQL, -) -from ee.clickhouse.sql.session_recording_events import ( - DROP_SESSION_RECORDING_EVENTS_TABLE_SQL, - SESSION_RECORDING_EVENTS_TABLE_SQL, -) +from ee.clickhouse.client import ch_pool, sync_execute +from ee.clickhouse.sql.events import DROP_EVENTS_TABLE_SQL, EVENTS_TABLE_SQL +from ee.clickhouse.sql.person import DROP_PERSON_TABLE_SQL, PERSONS_TABLE_SQL +from posthog.test.base import BaseTest +@pytest.mark.usefixtures("unittest_snapshot") class ClickhouseTestMixin: - def tearDown(self): - try: - self._destroy_event_tables() - self._destroy_person_tables() - self._destroy_session_recording_tables() - - self._create_event_tables() - self._create_person_tables() - self._create_session_recording_tables() - except ServerException as e: - print(e) - pass - - def _destroy_person_tables(self): - sync_execute(DROP_PERSON_TABLE_SQL) - sync_execute(DROP_PERSON_DISTINCT_ID_TABLE_SQL) - - def _create_person_tables(self): - 
sync_execute(PERSONS_TABLE_SQL) - sync_execute(PERSONS_DISTINCT_ID_TABLE_SQL) - - def _destroy_session_recording_tables(self): - sync_execute(DROP_SESSION_RECORDING_EVENTS_TABLE_SQL) + RUN_MATERIALIZED_COLUMN_TESTS = True + # overrides the basetest in posthog/test/base.py + #  this way the team id will increment so we don't have to destroy all clickhouse tables on each test + CLASS_DATA_LEVEL_SETUP = False - def _create_session_recording_tables(self): - sync_execute(SESSION_RECORDING_EVENTS_TABLE_SQL) - - def _destroy_event_tables(self): - sync_execute(DROP_EVENTS_TABLE_SQL) - sync_execute(DROP_EVENTS_WITH_ARRAY_PROPS_TABLE_SQL) - sync_execute(DROP_MAT_EVENTS_WITH_ARRAY_PROPS_TABLE_SQL) - sync_execute(DROP_MAT_EVENTS_PROP_TABLE_SQL) - - def _create_event_tables(self): - sync_execute(EVENTS_TABLE_SQL) - sync_execute(EVENTS_WITH_PROPS_TABLE_SQL) - sync_execute(MAT_EVENTS_WITH_PROPS_TABLE_SQL) - sync_execute(MAT_EVENT_PROP_TABLE_SQL) + snapshot: Any @contextmanager def _assertNumQueries(self, func): @@ -73,3 +30,80 @@ def _assertNumQueries(self, func): # Ignore assertNumQueries in clickhouse tests def assertNumQueries(self, num, func=None, *args, using=DEFAULT_DB_ALIAS, **kwargs): return self._assertNumQueries(func) + + # :NOTE: Update snapshots by passing --snapshot-update to bin/tests + def assertQueryMatchesSnapshot(self, query, params=None): + # :TRICKY: team_id changes every test, avoid it messing with snapshots. + query = re.sub(r"(team|cohort)_id = \d+", r"\1_id = 2", query) + + assert sqlparse.format(query, reindent=True) == self.snapshot, "\n".join(self.snapshot.get_assert_diff()) + if params is not None: + del params["team_id"] # Changes every run + assert params == self.snapshot, "\n".join(self.snapshot.get_assert_diff()) + + @contextmanager + def capture_select_queries(self): + queries = [] + original_get_client = ch_pool.get_client + + # Spy on the `clickhouse_driver.Client.execute` method. This is a bit of + # a roundabout way to handle this, but it seems tricky to spy on the + # unbound class method `Client.execute` directly + @contextmanager + def get_client(): + with original_get_client() as client: + original_client_execute = client.execute + + def execute_wrapper(query, *args, **kwargs): + if sqlparse.format(query, strip_comments=True).strip().startswith("SELECT"): + queries.append(query) + return original_client_execute(query, *args, **kwargs) + + with patch.object(client, "execute", wraps=execute_wrapper) as _: + yield client + + with patch("ee.clickhouse.client.ch_pool.get_client", wraps=get_client) as _: + yield queries + + +class ClickhouseDestroyTablesMixin(BaseTest): + """ + To speed up tests we normally don't destroy the tables between tests, so clickhouse tables will have data from previous tests. + Use this mixin to make sure you completely destroy the tables between tests. + """ + + def setUp(self): + super().setUp() + sync_execute(DROP_EVENTS_TABLE_SQL) + sync_execute(EVENTS_TABLE_SQL) + sync_execute(DROP_PERSON_TABLE_SQL) + sync_execute(PERSONS_TABLE_SQL) + + def tearDown(self): + super().tearDown() + sync_execute(DROP_EVENTS_TABLE_SQL) + sync_execute(EVENTS_TABLE_SQL) + sync_execute(DROP_PERSON_TABLE_SQL) + sync_execute(PERSONS_TABLE_SQL) + + +def snapshot_clickhouse_queries(fn): + """ + Captures and snapshots select queries from the test using the `syrupy` library. + + Requires queries to be stable to avoid flakiness. + + Snapshots are automatically saved in a __snapshots__/*.ambr file. + Update snapshots via --snapshot-update.
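To make the intended workflow concrete, a hypothetical test using these helpers could look like the sketch below; the test class name and the query are illustrative only, not part of the patch:

    from ee.clickhouse.client import sync_execute
    from ee.clickhouse.util import ClickhouseTestMixin, snapshot_clickhouse_queries
    from posthog.test.base import BaseTest

    class TestExampleQueries(ClickhouseTestMixin, BaseTest):
        @snapshot_clickhouse_queries
        def test_event_count_query(self):
            # Every SELECT issued inside the test body is captured by capture_select_queries()
            # and compared against the stored syrupy snapshot (__snapshots__/*.ambr).
            sync_execute(
                "SELECT count() FROM events WHERE team_id = %(team_id)s", {"team_id": self.team.pk}
            )

Because team_id differs between runs, assertQueryMatchesSnapshot normalises `team_id = <n>` occurrences to a fixed value before comparing against the snapshot.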
+ """ + + @wraps(fn) + def wrapped(self, *args, **kwargs): + with self.capture_select_queries() as queries: + fn(self, *args, **kwargs) + + for query in queries: + if "FROM system.columns" not in query: + self.assertQueryMatchesSnapshot(query) + + return wrapped diff --git a/ee/clickhouse/views/actions.py b/ee/clickhouse/views/actions.py index 9d56cc92f36d9..dc5962b76b6c2 100644 --- a/ee/clickhouse/views/actions.py +++ b/ee/clickhouse/views/actions.py @@ -1,93 +1,48 @@ # NOTE: bad django practice but /ee specifically depends on /posthog so it should be fine -from datetime import timedelta -from typing import Any, Dict, List, Optional, Tuple +from datetime import datetime +from typing import Any, Dict, List, Optional -from dateutil.relativedelta import relativedelta -from django.utils import timezone from rest_framework import serializers from rest_framework.decorators import action from rest_framework.request import Request from rest_framework.response import Response +from rest_framework_csv import renderers as csvrenderers from ee.clickhouse.client import sync_execute from ee.clickhouse.models.action import format_action_filter -from ee.clickhouse.models.cohort import format_filter_query -from ee.clickhouse.models.person import ClickhousePersonSerializer -from ee.clickhouse.models.property import parse_prop_clauses -from ee.clickhouse.queries.util import parse_timestamps -from ee.clickhouse.sql.person import GET_LATEST_PERSON_SQL, PEOPLE_SQL, PEOPLE_THROUGH_DISTINCT_SQL, PERSON_TREND_SQL -from ee.clickhouse.sql.stickiness.stickiness_people import STICKINESS_PEOPLE_SQL +from ee.clickhouse.queries.trends.person import TrendsPersonQuery +from ee.clickhouse.sql.person import INSERT_COHORT_ALL_PEOPLE_THROUGH_PERSON_ID, PERSON_STATIC_COHORT_TABLE from posthog.api.action import ActionSerializer, ActionViewSet -from posthog.constants import TREND_FILTER_TYPE_ACTIONS +from posthog.api.utils import get_target_entity from posthog.models.action import Action from posthog.models.cohort import Cohort from posthog.models.entity import Entity -from posthog.models.filter import Filter -from posthog.models.property import Property -from posthog.models.team import Team +from posthog.models.filters import Filter class ClickhouseActionSerializer(ActionSerializer): is_calculating = serializers.SerializerMethodField() - def get_count(self, action: Action) -> Optional[int]: - if self.context.get("view") and self.context["view"].action != "list": - query, params = format_action_filter(action) - if query == "": - return None - return sync_execute( - "SELECT count(1) FROM events WHERE team_id = %(team_id)s AND {}".format(query), - {"team_id": action.team_id, **params}, - )[0][0] - return None - def get_is_calculating(self, action: Action) -> bool: return False -class ClickhouseActions(ActionViewSet): +class ClickhouseActionsViewSet(ActionViewSet): serializer_class = ClickhouseActionSerializer - # Don't calculate actions in Clickhouse as it's on the fly - def _calculate_action(self, action: Action) -> None: - pass - def list(self, request: Request, *args: Any, **kwargs: Any) -> Response: actions = self.get_queryset() actions_list: List[Dict[Any, Any]] = self.serializer_class(actions, many=True, context={"request": request}).data # type: ignore return Response({"results": actions_list}) @action(methods=["GET"], detail=False) - def people(self, request: Request, *args: Any, **kwargs: Any) -> Response: - - team = request.user.team - filter = Filter(request=request) - shown_as = request.GET.get("shown_as") - - if 
len(filter.entities) >= 1: - entity = filter.entities[0] - else: - entity = Entity({"id": request.GET["entityId"], "type": request.GET["type"]}) - - # adhoc date handling. parsed differently with django orm - date_from = filter.date_from or timezone.now() - if filter.interval == "month": - filter._date_to = (date_from + relativedelta(months=1) - timedelta(days=1)).strftime("%Y-%m-%d %H:%M:%S") - elif filter.interval == "week": - filter._date_to = date_from + timedelta(weeks=1) - elif filter.interval == "hour": - filter._date_to = date_from + timedelta(hours=1) - elif filter.interval == "minute": - filter._date_to = date_from + timedelta(minutes=1) + def people(self, request: Request, *args: Any, **kwargs: Any) -> Response: # type: ignore + team = self.team + filter = Filter(request=request, team=self.team) + entity = get_target_entity(request) current_url = request.get_full_path() - - if shown_as is not None and shown_as == "Stickiness": - stickiness_day = int(request.GET["stickiness_days"]) - serialized_people = self._calculate_stickiness_entity_people(team, entity, filter, stickiness_day) - - else: - serialized_people = self._calculate_entity_people(team, entity, filter) + serialized_people = TrendsPersonQuery(team, entity, filter).get_people() current_url = request.get_full_path() next_url: Optional[str] = request.get_full_path() @@ -102,95 +57,50 @@ def people(self, request: Request, *args: Any, **kwargs: Any) -> Response: ) else: next_url = None + + if request.accepted_renderer.format == "csv": + csvrenderers.CSVRenderer.header = ["Distinct ID", "Internal ID", "Email", "Name", "Properties"] + content = [ + { + "Name": person.get("properties", {}).get("name"), + "Distinct ID": person.get("distinct_ids", [""])[0], + "Internal ID": person.get("id"), + "Email": person.get("properties", {}).get("email"), + "Properties": person.get("properties", {}), + } + for person in serialized_people + ] + return Response(content) + return Response( { - "results": [{"people": serialized_people[0:100], "count": len(serialized_people[0:99])}], + "results": [{"people": serialized_people[0:100], "count": len(serialized_people[0:100])}], "next": next_url, "previous": current_url[1:], } ) - def _format_entity_filter(self, entity: Entity) -> Tuple[str, Dict]: - if entity.type == TREND_FILTER_TYPE_ACTIONS: - try: - action = Action.objects.get(pk=entity.id) - action_query, params = format_action_filter(action) - entity_filter = "AND {}".format(action_query) + @action(methods=["GET"], detail=True) + def count(self, request: Request, **kwargs) -> Response: # type: ignore + action = self.get_object() + query, params = format_action_filter(action) + if query == "": + return Response({"count": 0}) - except Action.DoesNotExist: - raise ValueError("This action does not exist") - else: - entity_filter = "AND event = %(event)s" - params = {"event": entity.id} - - return entity_filter, params - - def _calculate_stickiness_entity_people(self, team: Team, entity: Entity, filter: Filter, stickiness_day: int): - parsed_date_from, parsed_date_to = parse_timestamps(filter=filter) - prop_filters, prop_filter_params = parse_prop_clauses(filter.properties, team.pk) - entity_sql, entity_params = self._format_entity_filter(entity=entity) - - params: Dict = { - "team_id": team.pk, - **prop_filter_params, - "stickiness_day": stickiness_day, - **entity_params, - "offset": filter.offset, - } - - content_sql = STICKINESS_PEOPLE_SQL.format( - entity_filter=entity_sql, - parsed_date_from=(parsed_date_from or ""), - 
parsed_date_to=(parsed_date_to or ""), - filters="{filters}".format(filters=prop_filters) if filter.properties else "", + results = sync_execute( + "SELECT count(1) FROM events WHERE team_id = %(team_id)s AND {}".format(query), + {"team_id": action.team_id, **params}, ) + return Response({"count": results[0][0]}) - people = sync_execute( - PEOPLE_SQL.format( - content_sql=content_sql, query="", latest_person_sql=GET_LATEST_PERSON_SQL.format(query="") - ), - params, - ) - serialized_people = ClickhousePersonSerializer(people, many=True).data - - return serialized_people - - def _calculate_entity_people(self, team: Team, entity: Entity, filter: Filter): - parsed_date_from, parsed_date_to = parse_timestamps(filter=filter) - entity_sql, entity_params = self._format_entity_filter(entity=entity) - person_filter = "" - person_filter_params: Dict[str, Any] = {} - - if filter.breakdown_type == "cohort" and filter.breakdown_value != "all": - cohort = Cohort.objects.get(pk=filter.breakdown_value) - person_filter, person_filter_params = format_filter_query(cohort) - person_filter = "AND distinct_id IN ({})".format(person_filter) - elif ( - filter.breakdown_type == "person" - and isinstance(filter.breakdown, str) - and isinstance(filter.breakdown_value, str) - ): - person_prop = Property(**{"key": filter.breakdown, "value": filter.breakdown_value, "type": "person"}) - filter.properties.append(person_prop) - - prop_filters, prop_filter_params = parse_prop_clauses(filter.properties, team.pk) - params: Dict = {"team_id": team.pk, **prop_filter_params, **entity_params, "offset": filter.offset} - - content_sql = PERSON_TREND_SQL.format( - entity_filter=entity_sql, - parsed_date_from=parsed_date_from, - parsed_date_to=parsed_date_to, - filters=prop_filters, - breakdown_filter="", - person_filter=person_filter, - ) - people = sync_execute( - PEOPLE_THROUGH_DISTINCT_SQL.format( - content_sql=content_sql, latest_person_sql=GET_LATEST_PERSON_SQL.format(query="") - ), - {**params, **person_filter_params}, - ) - serialized_people = ClickhousePersonSerializer(people, many=True).data +def insert_entity_people_into_cohort(cohort: Cohort, entity: Entity, filter: Filter): + query, params = TrendsPersonQuery(cohort.team, entity, filter).get_query() + sync_execute( + INSERT_COHORT_ALL_PEOPLE_THROUGH_PERSON_ID.format(cohort_table=PERSON_STATIC_COHORT_TABLE, query=query), + {"cohort_id": cohort.pk, "_timestamp": datetime.now(), **params}, + ) + - return serialized_people +class LegacyClickhouseActionsViewSet(ClickhouseActionsViewSet): + legacy_team_compatibility = True diff --git a/ee/clickhouse/views/cohort.py b/ee/clickhouse/views/cohort.py new file mode 100644 index 0000000000000..7f1bb60e69293 --- /dev/null +++ b/ee/clickhouse/views/cohort.py @@ -0,0 +1,43 @@ +from rest_framework.request import Request + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.queries.util import get_earliest_timestamp +from ee.clickhouse.sql.person import PERSON_STATIC_COHORT_TABLE +from posthog.api.cohort import CohortSerializer, CohortViewSet +from posthog.constants import INSIGHT_STICKINESS, INSIGHT_TRENDS +from posthog.models.cohort import Cohort +from posthog.models.entity import Entity +from posthog.models.filters.filter import Filter +from posthog.models.filters.stickiness_filter import StickinessFilter +from posthog.models.team import Team +from posthog.tasks.calculate_cohort import insert_cohort_from_query + + +class ClickhouseCohortSerializer(CohortSerializer): + earliest_timestamp_func = get_earliest_timestamp + + 
def _handle_stickiness_people(self, target_entity: Entity, cohort: Cohort, filter: StickinessFilter) -> None: + insert_cohort_from_query.delay( + cohort.pk, INSIGHT_STICKINESS, filter.to_dict(), entity_data=target_entity.to_dict() + ) + + def _handle_trend_people(self, target_entity: Entity, cohort: Cohort, filter: Filter, request: Request) -> None: + insert_cohort_from_query.delay(cohort.pk, INSIGHT_TRENDS, filter.to_dict(), entity_data=target_entity.to_dict()) + + +def insert_cohort_people_into_pg(cohort: Cohort): + ids = sync_execute( + "SELECT person_id FROM {} where team_id = %(team_id)s AND cohort_id = %(cohort_id)s".format( + PERSON_STATIC_COHORT_TABLE + ), + {"cohort_id": cohort.pk, "team_id": cohort.team.pk}, + ) + cohort.insert_users_list_by_uuid(items=[str(id[0]) for id in ids]) + + +class ClickhouseCohortViewSet(CohortViewSet): + serializer_class = ClickhouseCohortSerializer + + +class LegacyClickhouseCohortViewSet(ClickhouseCohortViewSet): + legacy_team_compatibility = True diff --git a/ee/clickhouse/views/element.py b/ee/clickhouse/views/element.py index b446df5e51a83..0067365542b43 100644 --- a/ee/clickhouse/views/element.py +++ b/ee/clickhouse/views/element.py @@ -7,20 +7,20 @@ from ee.clickhouse.queries.util import parse_timestamps from ee.clickhouse.sql.element import GET_ELEMENTS, GET_VALUES from posthog.api.element import ElementSerializer, ElementViewSet -from posthog.models.filter import Filter +from posthog.models.filters import Filter -class ClickhouseElement(ElementViewSet): +class ClickhouseElementViewSet(ElementViewSet): @action(methods=["GET"], detail=False) - def stats(self, request: request.Request) -> response.Response: - filter = Filter(request=request) + def stats(self, request: request.Request, **kwargs) -> response.Response: # type: ignore + filter = Filter(request=request, team=self.team) - date_from, date_to = parse_timestamps(filter) + date_from, date_to, date_params = parse_timestamps(filter, team_id=self.team.pk) - prop_filters, prop_filter_params = parse_prop_clauses(filter.properties, request.user.team.pk) + prop_filters, prop_filter_params = parse_prop_clauses(filter.properties, self.team.pk) result = sync_execute( GET_ELEMENTS.format(date_from=date_from, date_to=date_to, query=prop_filters), - {"team_id": request.user.team.id, **prop_filter_params}, + {"team_id": self.team.pk, **prop_filter_params, **date_params}, ) return response.Response( [ @@ -34,7 +34,7 @@ def stats(self, request: request.Request) -> response.Response: ) @action(methods=["GET"], detail=False) - def values(self, request: request.Request) -> response.Response: + def values(self, request: request.Request, **kwargs) -> response.Response: # type: ignore key = request.GET.get("key") value = request.GET.get("value") select_regex = '[:|"]{}="(.*?)"'.format(key) @@ -44,10 +44,10 @@ def values(self, request: request.Request) -> response.Response: return response.Response([]) if key == "tag_name": - select_regex = "^([-_a-zA-Z0-9]*?)[\.|:]" + select_regex = r"^([-_a-zA-Z0-9]*?)[\.|:]" filter_regex = select_regex if value: - filter_regex = "^([-_a-zA-Z0-9]*?{}[-_a-zA-Z0-9]*?)[\.|:]".format(value) + filter_regex = r"^([-_a-zA-Z0-9]*?{}[-_a-zA-Z0-9]*?)[\.|:]".format(value) else: if value: filter_regex = '[:|"]{}=".*?{}.*?"'.format(key, value) @@ -55,6 +55,10 @@ def values(self, request: request.Request) -> response.Response: filter_regex = select_regex result = sync_execute( - GET_VALUES.format(), {"team_id": request.user.team.id, "regex": select_regex, "filter_regex": filter_regex} + 
GET_VALUES.format(), {"team_id": self.team.id, "regex": select_regex, "filter_regex": filter_regex} ) return response.Response([{"name": value[0]} for value in result]) + + +class LegacyClickhouseElementViewSet(ClickhouseElementViewSet): + legacy_team_compatibility = True diff --git a/ee/clickhouse/views/events.py b/ee/clickhouse/views/events.py index 8d95db66fb262..48d41b8ee1c6c 100644 --- a/ee/clickhouse/views/events.py +++ b/ee/clickhouse/views/events.py @@ -1,7 +1,10 @@ -from typing import Any, Dict, List, Optional +import json +from datetime import timedelta +from typing import Any, Dict, List, Optional, Union -from rest_framework import viewsets +from django.utils.timezone import now from rest_framework.decorators import action +from rest_framework.exceptions import NotFound from rest_framework.request import Request from rest_framework.response import Response @@ -11,104 +14,182 @@ from ee.clickhouse.models.person import get_persons_by_distinct_ids from ee.clickhouse.models.property import get_property_values_for_key, parse_prop_clauses from ee.clickhouse.queries.clickhouse_session_recording import SessionRecording -from ee.clickhouse.queries.util import parse_timestamps -from ee.clickhouse.sql.events import SELECT_EVENT_WITH_ARRAY_PROPS_SQL, SELECT_EVENT_WITH_PROP_SQL, SELECT_ONE_EVENT_SQL +from ee.clickhouse.queries.sessions.list import ClickhouseSessionsList +from ee.clickhouse.sql.events import ( + GET_CUSTOM_EVENTS, + SELECT_EVENT_BY_TEAM_AND_CONDITIONS_FILTERS_SQL, + SELECT_EVENT_BY_TEAM_AND_CONDITIONS_SQL, + SELECT_ONE_EVENT_SQL, +) from posthog.api.event import EventViewSet from posthog.models import Filter, Person, Team from posthog.models.action import Action -from posthog.utils import convert_property_value +from posthog.models.filters.sessions_filter import SessionEventsFilter, SessionsFilter +from posthog.models.session_recording_event import SessionRecordingViewed +from posthog.models.utils import UUIDT +from posthog.utils import convert_property_value, flatten -class ClickhouseEvents(EventViewSet): +class ClickhouseEventsViewSet(EventViewSet): + serializer_class = ClickhouseEventSerializer # type: ignore + def _get_people(self, query_result: List[Dict], team: Team) -> Dict[str, Any]: distinct_ids = [event[5] for event in query_result] persons = get_persons_by_distinct_ids(team.pk, distinct_ids) - distinct_to_person: Dict[str, Person] = {} for person in persons: for distinct_id in person.distinct_ids: distinct_to_person[distinct_id] = person return distinct_to_person - def list(self, request: Request, *args: Any, **kwargs: Any) -> Response: + def _query_events_list( + self, filter: Filter, team: Team, request: Request, long_date_from: bool = False, limit: int = 100 + ) -> List: + limit += 1 + limit_sql = "LIMIT %(limit)s" + order = "DESC" if self._parse_order_by(self.request)[0] == "-timestamp" else "ASC" + + conditions, condition_params = determine_event_conditions( + team, + { + "after": (now() - timedelta(days=1)).isoformat(), + "before": (now() + timedelta(seconds=5)).isoformat(), + **request.GET.dict(), + }, + long_date_from, + ) + prop_filters, prop_filter_params = parse_prop_clauses(filter.properties, team.pk, has_person_id_joined=False) - team = request.user.team - filter = Filter(request=request) - if request.GET.get("after"): - filter._date_from = request.GET["after"] - if request.GET.get("before"): - filter._date_to = request.GET["before"] - limit = "LIMIT 101" - conditions, condition_params = determine_event_conditions(request.GET.dict()) - prop_filters, 
prop_filter_params = parse_prop_clauses(filter.properties, team.pk) if request.GET.get("action_id"): - action = Action.objects.get(pk=request.GET["action_id"]) + try: + action = Action.objects.get(pk=request.GET["action_id"], team_id=team.pk) + except Action.DoesNotExist: + return [] if action.steps.count() == 0: - return Response({"next": False, "results": []}) + return [] action_query, params = format_action_filter(action) prop_filters += " AND {}".format(action_query) prop_filter_params = {**prop_filter_params, **params} if prop_filters != "": - query_result = sync_execute( - SELECT_EVENT_WITH_PROP_SQL.format(conditions=conditions, limit=limit, filters=prop_filters), - {"team_id": team.pk, **condition_params, **prop_filter_params}, + return sync_execute( + SELECT_EVENT_BY_TEAM_AND_CONDITIONS_FILTERS_SQL.format( + conditions=conditions, limit=limit_sql, filters=prop_filters, order=order + ), + {"team_id": team.pk, "limit": limit, **condition_params, **prop_filter_params}, ) else: - query_result = sync_execute( - SELECT_EVENT_WITH_ARRAY_PROPS_SQL.format(conditions=conditions, limit=limit), - {"team_id": team.pk, **condition_params}, + return sync_execute( + SELECT_EVENT_BY_TEAM_AND_CONDITIONS_SQL.format(conditions=conditions, limit=limit_sql, order=order), + {"team_id": team.pk, "limit": limit, **condition_params}, ) - result = ClickhouseEventSerializer( - query_result[0:100], many=True, context={"people": self._get_people(query_result, team),}, - ).data + def list(self, request: Request, *args: Any, **kwargs: Any) -> Response: + is_csv_request = self.request.accepted_renderer.format == "csv" - if len(query_result) > 100: - path = request.get_full_path() - reverse = request.GET.get("orderBy", "-timestamp") != "-timestamp" - next_url: Optional[str] = request.build_absolute_uri( - "{}{}{}={}".format( - path, - "&" if "?" 
in path else "?", - "after" if reverse else "before", - query_result[99][3].strftime("%Y-%m-%dT%H:%M:%S.%fZ"), - ) - ) + if self.request.GET.get("limit", None): + limit = int(self.request.GET.get("limit")) # type: ignore + elif is_csv_request: + limit = self.CSV_EXPORT_DEFAULT_LIMIT else: - next_url = None + limit = 100 - return Response({"next": next_url, "results": result}) + if is_csv_request: + limit = min(limit, self.CSV_EXPORT_MAXIMUM_LIMIT) - def retrieve(self, request: Request, pk: Optional[int] = None, *args: Any, **kwargs: Any) -> Response: + team = self.team + filter = Filter(request=request, team=self.team) - # TODO: implement getting elements - team = request.user.team - query_result = sync_execute(SELECT_ONE_EVENT_SQL, {"team_id": team.pk, "event_id": pk},) - result = ClickhouseEventSerializer(query_result[0], many=False).data + query_result = self._query_events_list(filter, team, request, limit=limit) - return Response(result) + # Retry the query without the 1 day optimization + if len(query_result) < limit and not request.GET.get("after"): + query_result = self._query_events_list(filter, team, request, long_date_from=True, limit=limit) - @action(methods=["GET"], detail=False) - def values(self, request: Request) -> Response: + result = ClickhouseEventSerializer( + query_result[0:limit], many=True, context={"people": self._get_people(query_result, team),}, + ).data + + next_url: Optional[str] = None + if not is_csv_request and len(query_result) > limit: + next_url = self._build_next_url(request, query_result[limit - 1][3]) + + return Response({"next": next_url, "results": result}) + def retrieve(self, request: Request, pk: Optional[Union[int, str]] = None, *args: Any, **kwargs: Any) -> Response: + if not isinstance(pk, str) or not UUIDT.is_valid_uuid(pk): + return Response({"detail": "Invalid UUID", "code": "invalid", "type": "validation_error",}, status=400) + query_result = sync_execute(SELECT_ONE_EVENT_SQL, {"team_id": self.team.pk, "event_id": pk.replace("-", "")}) + if len(query_result) == 0: + raise NotFound(detail=f"No events exist for event UUID {pk}") + res = ClickhouseEventSerializer(query_result[0], many=False).data + return Response(res) + + @action(methods=["GET"], detail=False) + def values(self, request: Request, **kwargs) -> Response: # type: ignore key = request.GET.get("key") - team = request.user.team + team = self.team result = [] - if key: + flattened = [] + if key == "custom_event": + events = sync_execute(GET_CUSTOM_EVENTS, {"team_id": team.pk}) + return Response([{"name": event[0]} for event in events]) + elif key: result = get_property_values_for_key(key, team, value=request.GET.get("value")) - return Response([{"name": convert_property_value(value[0])} for value in result]) + for value in result: + try: + # Try loading as json for dicts or arrays + flattened.append(json.loads(value[0])) + except json.decoder.JSONDecodeError: + flattened.append(value[0]) + return Response([{"name": convert_property_value(value)} for value in flatten(flattened)]) + + @action(methods=["GET"], detail=False) + def sessions(self, request: Request, *args: Any, **kwargs: Any) -> Response: # type: ignore + filter = SessionsFilter(request=request, team=self.team) + + sessions, pagination = ClickhouseSessionsList.run(team=self.team, filter=filter) + return Response({"result": sessions, "pagination": pagination}) + + @action(methods=["GET"], detail=False) + def session_events(self, request: Request, *args: Any, **kwargs: Any) -> Response: # type: ignore + from 
ee.clickhouse.queries.sessions.events import SessionsListEvents + + filter = SessionEventsFilter(request=request, team=self.team) + return Response({"result": SessionsListEvents().run(filter=filter, team=self.team)}) # ****************************************** - # /event/session_recording + # /events/session_recording # params: # - session_recording_id: (string) id of the session recording + # - save_view: (boolean) save view of the recording # ****************************************** @action(methods=["GET"], detail=False) - def session_recording(self, request: Request, *args: Any, **kwargs: Any) -> Response: - team = self.request.user.team - snapshots = SessionRecording().run( - team=team, filter=Filter(request=request), session_recording_id=request.GET.get("session_recording_id") - ) + def session_recording(self, request: Request, *args: Any, **kwargs: Any) -> Response: # type: ignore + if not request.GET.get("session_recording_id"): + return Response( + { + "detail": "The query parameter session_recording_id is required for this endpoint.", + "type": "validation_error", + "code": "invalid", + }, + status=400, + ) + + session_recording = SessionRecording( + request=request, + team=self.team, + filter=Filter(request=request, team=self.team), + session_recording_id=request.GET["session_recording_id"], + ).run() + + if request.GET.get("save_view"): + SessionRecordingViewed.objects.get_or_create( + team=self.team, user=request.user, session_id=request.GET["session_recording_id"] + ) + + return Response({"result": session_recording}) + - return Response({"result": snapshots}) +class LegacyClickhouseEventsViewSet(ClickhouseEventsViewSet): + legacy_team_compatibility = True diff --git a/ee/clickhouse/views/groups.py b/ee/clickhouse/views/groups.py new file mode 100644 index 0000000000000..2bd3ca214bb18 --- /dev/null +++ b/ee/clickhouse/views/groups.py @@ -0,0 +1,43 @@ +import json +from collections import defaultdict + +from rest_framework import exceptions, request, response, serializers, viewsets +from rest_framework.decorators import action +from rest_framework.mixins import ListModelMixin, RetrieveModelMixin + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.sql.person import GET_TEAM_PERSON_DISTINCT_IDS +from posthog.api.routing import StructuredViewSetMixin +from posthog.models.group_type_mapping import GroupTypeMapping + + +class GroupTypeSerializer(serializers.ModelSerializer): + class Meta: + model = GroupTypeMapping + fields = ["group_type", "group_type_index"] + + +class ClickhouseGroupsView(StructuredViewSetMixin, ListModelMixin, viewsets.GenericViewSet): + serializer_class = GroupTypeSerializer + queryset = GroupTypeMapping.objects.all() + pagination_class = None + + @action(methods=["GET"], detail=False) + def property_definitions(self, request: request.Request, **kw): + rows = sync_execute( + f""" + SELECT group_type_index, tupleElement(keysAndValues, 1) as key, count(*) as count + FROM groups + ARRAY JOIN JSONExtractKeysAndValuesRaw(group_properties) as keysAndValues + WHERE team_id = %(team_id)s + GROUP BY group_type_index, tupleElement(keysAndValues, 1) + ORDER BY group_type_index ASC, count DESC, key ASC + """, + {"team_id": self.team.pk}, + ) + + group_type_index_to_properties = defaultdict(list) + for group_type_index, key, count in rows: + group_type_index_to_properties[group_type_index].append({"name": key, "count": count}) + + return response.Response(group_type_index_to_properties) diff --git a/ee/clickhouse/views/insights.py 
b/ee/clickhouse/views/insights.py index 5ff4394a92edd..92c3b6b1cbbcb 100644 --- a/ee/clickhouse/views/insights.py +++ b/ee/clickhouse/views/insights.py @@ -1,82 +1,147 @@ -from typing import Any +import json +from typing import Any, Dict, Type from rest_framework.decorators import action from rest_framework.request import Request from rest_framework.response import Response -from ee.clickhouse.models.person import get_persons_by_distinct_ids -from ee.clickhouse.queries.clickhouse_funnel import ClickhouseFunnel -from ee.clickhouse.queries.clickhouse_paths import ClickhousePaths +from ee.clickhouse.queries import ClickhousePaths from ee.clickhouse.queries.clickhouse_retention import ClickhouseRetention from ee.clickhouse.queries.clickhouse_stickiness import ClickhouseStickiness +from ee.clickhouse.queries.funnels import ( + ClickhouseFunnel, + ClickhouseFunnelBase, + ClickhouseFunnelStrict, + ClickhouseFunnelTimeToConvert, + ClickhouseFunnelTrends, + ClickhouseFunnelUnordered, +) +from ee.clickhouse.queries.funnels.funnel_correlation import FunnelCorrelation from ee.clickhouse.queries.sessions.clickhouse_sessions import ClickhouseSessions -from ee.clickhouse.queries.sessions.list import SESSIONS_LIST_DEFAULT_LIMIT from ee.clickhouse.queries.trends.clickhouse_trends import ClickhouseTrends +from ee.clickhouse.queries.util import get_earliest_timestamp from posthog.api.insight import InsightViewSet -from posthog.constants import TRENDS_STICKINESS -from posthog.models.filter import Filter - - -class ClickhouseInsights(InsightViewSet): - @action(methods=["GET"], detail=False) - def trend(self, request: Request, *args: Any, **kwargs: Any) -> Response: - - team = request.user.team - filter = Filter(request=request) - - if filter.shown_as == TRENDS_STICKINESS: - result = ClickhouseStickiness().run(filter, team) +from posthog.constants import ( + INSIGHT_FUNNELS, + INSIGHT_PATHS, + INSIGHT_SESSIONS, + INSIGHT_STICKINESS, + PATHS_INCLUDE_EVENT_TYPES, + TRENDS_STICKINESS, + FunnelOrderType, + FunnelVizType, +) +from posthog.decorators import cached_function +from posthog.models.filters import Filter +from posthog.models.filters.path_filter import PathFilter +from posthog.models.filters.retention_filter import RetentionFilter +from posthog.models.filters.sessions_filter import SessionsFilter +from posthog.models.filters.stickiness_filter import StickinessFilter + + +class ClickhouseInsightsViewSet(InsightViewSet): + @cached_function + def calculate_trends(self, request: Request) -> Dict[str, Any]: + team = self.team + filter = Filter(request=request, team=self.team) + + if filter.insight == INSIGHT_STICKINESS or filter.shown_as == TRENDS_STICKINESS: + stickiness_filter = StickinessFilter( + request=request, team=team, get_earliest_timestamp=get_earliest_timestamp + ) + result = ClickhouseStickiness().run(stickiness_filter, team) else: - result = ClickhouseTrends().run(filter, team) + trends_query = ClickhouseTrends() + result = trends_query.run(filter, team) self._refresh_dashboard(request=request) - + return {"result": result} + + @cached_function + def calculate_session(self, request: Request) -> Dict[str, Any]: + return { + "result": ClickhouseSessions().run( + team=self.team, + filter=SessionsFilter(request=request, data={"insight": INSIGHT_SESSIONS}, team=self.team), + ) + } + + @cached_function + def calculate_path(self, request: Request) -> Dict[str, Any]: + team = self.team + filter = PathFilter(request=request, data={"insight": INSIGHT_PATHS}, team=self.team) + + funnel_filter = None + 
funnel_filter_data = request.GET.get("funnel_filter") or request.data.get("funnel_filter") + if funnel_filter_data: + if isinstance(funnel_filter_data, str): + funnel_filter_data = json.loads(funnel_filter_data) + funnel_filter = Filter(data={"insight": INSIGHT_FUNNELS, **funnel_filter_data}, team=self.team) + + #  backwards compatibility + if filter.path_type: + filter = filter.with_data({PATHS_INCLUDE_EVENT_TYPES: [filter.path_type]}) + resp = ClickhousePaths(filter=filter, team=team, funnel_filter=funnel_filter).run() + + return {"result": resp} + + @cached_function + def calculate_funnel(self, request: Request) -> Dict[str, Any]: + team = self.team + filter = Filter(request=request, data={"insight": INSIGHT_FUNNELS}, team=self.team) + + funnel_order_class: Type[ClickhouseFunnelBase] = ClickhouseFunnel + if filter.funnel_order_type == FunnelOrderType.UNORDERED: + funnel_order_class = ClickhouseFunnelUnordered + elif filter.funnel_order_type == FunnelOrderType.STRICT: + funnel_order_class = ClickhouseFunnelStrict + + if filter.funnel_viz_type == FunnelVizType.TRENDS: + return { + "result": ClickhouseFunnelTrends(team=team, filter=filter, funnel_order_class=funnel_order_class).run() + } + elif filter.funnel_viz_type == FunnelVizType.TIME_TO_CONVERT: + return { + "result": ClickhouseFunnelTimeToConvert( + team=team, filter=filter, funnel_order_class=funnel_order_class + ).run() + } + else: + return {"result": funnel_order_class(team=team, filter=filter).run()} + + # ****************************************** + # /projects/:id/insights/funnel/correlation + # + # params: + # - params are the same as for funnel + # + # Returns significant events, i.e. those that are correlated with a person + # making it through a funnel + # ****************************************** + @action(methods=["GET", "POST"], url_path="funnel/correlation", detail=False) + def funnel_correlation(self, request: Request, *args: Any, **kwargs: Any) -> Response: + result = self.calculate_funnel_correlation(request) return Response(result) - @action(methods=["GET"], detail=False) - def session(self, request: Request, *args: Any, **kwargs: Any) -> Response: - - team = request.user.team + @cached_function + def calculate_funnel_correlation(self, request: Request) -> Dict[str, Any]: + team = self.team filter = Filter(request=request) - limit = int(request.GET.get("limit", SESSIONS_LIST_DEFAULT_LIMIT)) - offset = int(request.GET.get("offset", 0)) + result = FunnelCorrelation(filter=filter, team=team).run() - response = ClickhouseSessions().run(team=team, filter=filter, limit=limit + 1, offset=offset) + return {"result": result} - if "distinct_id" in request.GET and request.GET["distinct_id"]: - try: - person_ids = get_persons_by_distinct_ids(team.pk, [request.GET["distinct_id"]])[0].distinct_ids - response = [session for i, session in enumerate(response) if response[i]["distinct_id"] in person_ids] - except IndexError: - response = [] - - if len(response) > limit: - response.pop() - return Response({"result": response, "offset": offset + limit}) - else: - return Response({"result": response,}) - - @action(methods=["GET"], detail=False) - def path(self, request: Request, *args: Any, **kwargs: Any) -> Response: - - team = request.user.team - filter = Filter(request=request) - resp = ClickhousePaths().run(filter=filter, team=team) - return Response(resp) - - @action(methods=["GET"], detail=False) - def funnel(self, request: Request, *args: Any, **kwargs: Any) -> Response: - - team = request.user.team - filter = 
Filter(request=request) - response = ClickhouseFunnel(team=team, filter=filter).run() - return Response(response) + @cached_function + def calculate_retention(self, request: Request) -> Dict[str, Any]: + team = self.team + data = {} + if not request.GET.get("date_from"): + data.update({"date_from": "-11d"}) + filter = RetentionFilter(data=data, request=request, team=self.team) + result = ClickhouseRetention().run(filter, team) + return {"result": result} - @action(methods=["GET"], detail=False) - def retention(self, request: Request, *args: Any, **kwargs: Any) -> Response: - team = request.user.team - filter = Filter(request=request) - result = ClickhouseRetention().run(filter, team) - return Response({"data": result}) +class LegacyClickhouseInsightsViewSet(ClickhouseInsightsViewSet): + legacy_team_compatibility = True diff --git a/ee/clickhouse/views/paths.py b/ee/clickhouse/views/paths.py index c17c632517038..689314e1d2760 100644 --- a/ee/clickhouse/views/paths.py +++ b/ee/clickhouse/views/paths.py @@ -4,17 +4,17 @@ from rest_framework.response import Response from ee.clickhouse.client import sync_execute -from ee.clickhouse.queries.clickhouse_paths import ClickhousePaths +from ee.clickhouse.queries import ClickhousePaths from ee.clickhouse.sql.events import ELEMENT_TAG_COUNT from posthog.api.paths import PathsViewSet from posthog.models import Event, Filter +from posthog.models.filters.path_filter import PathFilter class ClickhousePathsViewSet(PathsViewSet): @action(methods=["GET"], detail=False) - def elements(self, request: request.Request): - - team = request.user.team + def elements(self, request: request.Request, **kwargs): # type: ignore + team = self.team response = sync_execute(ELEMENT_TAG_COUNT, {"team_id": team.pk, "limit": 20}) resp = [] @@ -23,11 +23,6 @@ def elements(self, request: request.Request): return Response(resp) - # FIXME: Timestamp is timezone aware timestamp, date range uses naive date. - # To avoid unexpected results should convert date range to timestamps with timezone. 
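The FIXME removed above points at a real pitfall: event timestamps are timezone-aware while the requested date range is naive. A minimal sketch of the conversion it suggests, using Django utilities (illustrative only, not part of the patch):

    from datetime import date, datetime, time

    from django.utils import timezone

    def date_range_to_aware(date_from: date, date_to: date):
        # Convert naive calendar dates into timezone-aware datetimes spanning the full range,
        # so they compare safely against timezone-aware event timestamps.
        start = timezone.make_aware(datetime.combine(date_from, time.min))
        end = timezone.make_aware(datetime.combine(date_to, time.max))
        return start, end

    start, end = date_range_to_aware(date(2021, 9, 1), date(2021, 9, 30))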
- def list(self, request): - team = request.user.team - filter = Filter(request=request) - resp = ClickhousePaths().run(filter=filter, team=team) - return Response(resp) +class LegacyClickhousePathsViewSet(ClickhousePathsViewSet): + legacy_team_compatibility = True diff --git a/ee/clickhouse/views/person.py b/ee/clickhouse/views/person.py index da2112d3bbf6d..04824692b45b9 100644 --- a/ee/clickhouse/views/person.py +++ b/ee/clickhouse/views/person.py @@ -1,22 +1,191 @@ import json -from typing import List +from typing import Callable, Dict, List, Optional, Tuple -from rest_framework import request, response +from rest_framework.decorators import action +from rest_framework.exceptions import NotFound +from rest_framework.request import Request +from rest_framework.response import Response +from rest_framework.utils.serializer_helpers import ReturnDict, ReturnList +from ee.clickhouse.client import sync_execute from ee.clickhouse.models.person import delete_person +from ee.clickhouse.queries.clickhouse_retention import ClickhouseRetention +from ee.clickhouse.queries.clickhouse_stickiness import ClickhouseStickiness +from ee.clickhouse.queries.funnels import ClickhouseFunnelPersons, ClickhouseFunnelTrendsPersons +from ee.clickhouse.queries.funnels.funnel_correlation_persons import FunnelCorrelationPersons +from ee.clickhouse.queries.paths import ClickhousePathsPersons +from ee.clickhouse.queries.trends.lifecycle import ClickhouseLifecycle +from ee.clickhouse.sql.person import GET_PERSON_PROPERTIES_COUNT from posthog.api.person import PersonViewSet -from posthog.models import Event, Person +from posthog.constants import ( + FUNNEL_CORRELATION_PERSON_LIMIT, + FUNNEL_CORRELATION_PERSON_OFFSET, + INSIGHT_FUNNELS, + INSIGHT_PATHS, + FunnelVizType, +) +from posthog.decorators import cached_function +from posthog.models import Event, Filter, Person +from posthog.models.filters.path_filter import PathFilter +from posthog.utils import format_query_params_absolute_url -# TODO: Move grabbing all this to Clickhouse. See WIP-people-from-clickhouse branch. 
-class ClickhousePerson(PersonViewSet): - def destroy(self, request: request.Request, pk=None): # type: ignore - team = request.user.team - person = Person.objects.get(team=team, pk=pk) - # TODO: Probably won't need this after a while +class ClickhousePersonViewSet(PersonViewSet): + lifecycle_class = ClickhouseLifecycle + retention_class = ClickhouseRetention + stickiness_class = ClickhouseStickiness - events = Event.objects.filter(team=team, distinct_id__in=person.distinct_ids) - events.delete() - delete_person(person.uuid, delete_events=True, team_id=team.pk) - person.delete() - return response.Response(status=204) + @action(methods=["GET", "POST"], detail=False) + def funnel(self, request: Request, **kwargs) -> Response: + if request.user.is_anonymous or not self.team: + return Response(data=[]) + + results_package = self.calculate_funnel_persons(request) + + if not results_package: + return Response(data=[]) + + people, next_url, initial_url = results_package["result"] + + return Response( + data={ + "results": [{"people": people, "count": len(people)}], + "next": next_url, + "initial": initial_url, + "is_cached": results_package.get("is_cached"), + "last_refresh": results_package.get("last_refresh"), + } + ) + + @cached_function + def calculate_funnel_persons(self, request: Request) -> Dict[str, Tuple[list, Optional[str], Optional[str]]]: + if request.user.is_anonymous or not self.team: + return {"result": ([], None, None)} + + filter = Filter(request=request, data={"insight": INSIGHT_FUNNELS}, team=self.team) + funnel_class: Callable = ClickhouseFunnelPersons + + if filter.funnel_viz_type == FunnelVizType.TRENDS: + funnel_class = ClickhouseFunnelTrendsPersons + + people, should_paginate = funnel_class(filter, self.team).run() + limit = filter.limit if filter.limit else 100 + next_url = format_query_params_absolute_url(request, filter.offset + limit) if should_paginate else None + initial_url = format_query_params_absolute_url(request, 0) + + # cached_function expects a dict with the key result + return {"result": (people, next_url, initial_url)} + + @action(methods=["GET", "POST"], url_path="funnel/correlation", detail=False) + def funnel_correlation(self, request: Request, **kwargs) -> Response: + if request.user.is_anonymous or not self.team: + return Response(data=[]) + + results_package = self.calculate_funnel_correlation_persons(request) + + if not results_package: + return Response(data=[]) + + people, next_url, initial_url = results_package["result"] + + return Response( + data={ + "results": [{"people": people, "count": len(people)}], + "next": next_url, + "initial": initial_url, + "is_cached": results_package.get("is_cached"), + "last_refresh": results_package.get("last_refresh"), + } + ) + + @cached_function + def calculate_funnel_correlation_persons( + self, request: Request + ) -> Dict[str, Tuple[list, Optional[str], Optional[str]]]: + if request.user.is_anonymous or not self.team: + return {"result": ([], None, None)} + + filter = Filter(request=request, data={"insight": INSIGHT_FUNNELS}, team=self.team) + people, should_paginate = FunnelCorrelationPersons(filter=filter, team=self.team).run() + + limit = filter.correlation_person_limit if filter.correlation_person_limit else 100 + next_url = ( + format_query_params_absolute_url( + request, + filter.correlation_person_offset + limit, + offset_alias=FUNNEL_CORRELATION_PERSON_OFFSET, + limit_alias=FUNNEL_CORRELATION_PERSON_LIMIT, + ) + if should_paginate + else None + ) + initial_url = 
format_query_params_absolute_url(request, 0) + + # cached_function expects a dict with the key result + return {"result": (people, next_url, initial_url)} + + def get_properties(self, request: Request): + rows = sync_execute(GET_PERSON_PROPERTIES_COUNT, {"team_id": self.team.pk}) + return [{"name": name, "count": count} for name, count in rows] + + @action(methods=["GET", "POST"], detail=False) + def path(self, request: Request, **kwargs) -> Response: + if request.user.is_anonymous or not self.team: + return Response(data=[]) + + results_package = self.calculate_path_persons(request) + + if not results_package: + return Response(data=[]) + + people, next_url, initial_url = results_package["result"] + + return Response( + data={ + "results": [{"people": people, "count": len(people)}], + "next": next_url, + "initial": initial_url, + "is_cached": results_package.get("is_cached"), + "last_refresh": results_package.get("last_refresh"), + } + ) + + @cached_function + def calculate_path_persons(self, request: Request) -> Dict[str, Tuple[list, Optional[str], Optional[str]]]: + if request.user.is_anonymous or not self.team: + return {"result": ([], None, None)} + + filter = PathFilter(request=request, data={"insight": INSIGHT_PATHS}, team=self.team) + + funnel_filter = None + funnel_filter_data = request.GET.get("funnel_filter") or request.data.get("funnel_filter") + if funnel_filter_data: + if isinstance(funnel_filter_data, str): + funnel_filter_data = json.loads(funnel_filter_data) + funnel_filter = Filter(data={"insight": INSIGHT_FUNNELS, **funnel_filter_data}, team=self.team) + + people, should_paginate = ClickhousePathsPersons(filter, self.team, funnel_filter=funnel_filter).run() + limit = filter.limit or 100 + next_url = format_query_params_absolute_url(request, filter.offset + limit) if should_paginate else None + initial_url = format_query_params_absolute_url(request, 0) + + # cached_function expects a dict with the key result + return {"result": (people, next_url, initial_url)} + + def destroy(self, request: Request, pk=None, **kwargs): # type: ignore + try: + person = Person.objects.get(team=self.team, pk=pk) + + events = Event.objects.filter(team=self.team, distinct_id__in=person.distinct_ids) + events.delete() + delete_person( + person.uuid, person.properties, person.is_identified, delete_events=True, team_id=self.team.pk + ) + person.delete() + return Response(status=204) + except Person.DoesNotExist: + raise NotFound(detail="Person not found.") + + +class LegacyClickhousePersonViewSet(ClickhousePersonViewSet): + legacy_team_compatibility = True diff --git a/ee/clickhouse/views/session_recordings.py b/ee/clickhouse/views/session_recordings.py new file mode 100644 index 0000000000000..bb2b1a699e22b --- /dev/null +++ b/ee/clickhouse/views/session_recordings.py @@ -0,0 +1,18 @@ +from ee.clickhouse.queries.session_recordings.clickhouse_session_recording import ClickhouseSessionRecording +from ee.clickhouse.queries.session_recordings.clickhouse_session_recording_list import ClickhouseSessionRecordingList +from posthog.api.session_recording import SessionRecordingViewSet + + +class ClickhouseSessionRecordingViewSet(SessionRecordingViewSet): + def _get_session_recording_list(self, filter): + return ClickhouseSessionRecordingList(filter=filter, team=self.team).run() + + def _get_session_recording_snapshots(self, request, filter, session_recording_id): + return ClickhouseSessionRecording( + request=request, filter=filter, team=self.team, session_recording_id=session_recording_id + 
).get_snapshots() + + def _get_session_recording_meta_data(self, request, filter, session_recording_id): + return ClickhouseSessionRecording( + request=request, filter=filter, team=self.team, session_recording_id=session_recording_id + ).get_metadata() diff --git a/ee/clickhouse/views/test/test_clickhouse_action.py b/ee/clickhouse/views/test/test_clickhouse_action.py index 19e8134459dc0..1bd1bd94b904f 100644 --- a/ee/clickhouse/views/test/test_clickhouse_action.py +++ b/ee/clickhouse/views/test/test_clickhouse_action.py @@ -1,13 +1,14 @@ from unittest.mock import patch from uuid import uuid4 +from rest_framework import status + from ee.clickhouse.models.event import create_event from ee.clickhouse.util import ClickhouseTestMixin +from posthog.api.test.test_action import factory_test_action_api from posthog.api.test.test_action_people import action_people_test_factory -from posthog.models import Action, ActionStep -from posthog.models.cohort import Cohort -from posthog.models.person import Person -from posthog.models.team import Team +from posthog.constants import ENTITY_ID, ENTITY_MATH, ENTITY_TYPE, TRENDS_CUMULATIVE +from posthog.models import Action, ActionStep, Cohort, Organization, Person def _create_action(**kwargs): @@ -36,39 +37,303 @@ def _create_event(**kwargs): create_event(**kwargs) -class TestAction( +class TestActionApi(ClickhouseTestMixin, factory_test_action_api(_create_event)): # type: ignore + pass + + +class TestActionPeople( ClickhouseTestMixin, action_people_test_factory(_create_event, _create_person, _create_action, _create_cohort) # type: ignore ): - @patch("posthog.tasks.calculate_action.calculate_action.delay") - def test_is_calculating_always_false(self, patch_delay): - create = self.client.post("/api/action/", data={"name": "ooh",}, content_type="application/json",).json() - self.assertEqual(create["is_calculating"], False) - self.assertFalse(patch_delay.called) + @patch("posthog.models.action.Action.calculate_events") + def test_is_calculating_always_false(self, calculate_events): + create_response_wrapper = self.client.post(f"/api/projects/{self.team.id}/actions/", {"name": "ooh"}) + create_response = create_response_wrapper.json() + self.assertEqual(create_response_wrapper.status_code, status.HTTP_201_CREATED) + self.assertEqual(create_response["is_calculating"], False) + self.assertFalse(calculate_events.called) - response = self.client.get("/api/action/").json() + response = self.client.get(f"/api/projects/{self.team.id}/actions/").json() self.assertEqual(response["results"][0]["is_calculating"], False) - response = self.client.get("/api/action/%s/" % create["id"]).json() + response = self.client.get(f"/api/projects/{self.team.id}/actions/{create_response['id']}/").json() self.assertEqual(response["is_calculating"], False) # Make sure we're not re-calculating actions - response = self.client.patch( - "/api/action/%s/" % create["id"], data={"name": "ooh",}, content_type="application/json", + response = self.client.patch(f"/api/projects/{self.team.id}/actions/{create_response['id']}/", {"name": "ooh"}) + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.json()["name"], "ooh") + self.assertEqual(response.json()["is_calculating"], False) + self.assertFalse(calculate_events.called) + + def test_active_user_weekly_people(self): + p1 = _create_person(team_id=self.team.pk, distinct_ids=["p1"], properties={"name": "p1"}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-09T12:00:00Z", 
+ properties={"key": "val"}, + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-10T12:00:00Z", + properties={"key": "val"}, + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-11T12:00:00Z", + properties={"key": "val"}, + ) + + p2 = _create_person(team_id=self.team.pk, distinct_ids=["p2"], properties={"name": "p2"}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p2", + timestamp="2020-01-09T12:00:00Z", + properties={"key": "val"}, + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p2", + timestamp="2020-01-11T12:00:00Z", + properties={"key": "val"}, + ) + + people = self.client.get( + f"/api/projects/{self.team.id}/actions/people/", + data={ + "date_from": "2020-01-10", + "date_to": "2020-01-10", + ENTITY_TYPE: "events", + ENTITY_ID: "$pageview", + ENTITY_MATH: "weekly_active", + }, ).json() - self.assertEqual(response["name"], "ooh") - self.assertEqual(response["is_calculating"], False) - self.assertFalse(patch_delay.called) - - def test_only_get_count_on_retrieve(self): - team2 = Team.objects.create(name="bla") - action = Action.objects.create(team=self.team, name="bla") - ActionStep.objects.create(action=action, event="custom event") - _create_event(event="custom event", team=self.team, distinct_id="test") - _create_event(event="another event", team=self.team, distinct_id="test") - # test team leakage - _create_event(event="custom event", team=team2, distinct_id="test") - response = self.client.get("/api/action/").json() - self.assertEqual(response["results"][0]["count"], None) - - response = self.client.get("/api/action/%s/" % action.pk).json() - self.assertEqual(response["count"], 1) + self.assertEqual(len(people["results"][0]["people"]), 2) + + def test_breakdown_by_person_property_nones_people_endpoint(self): + p1 = _create_person(team_id=self.team.pk, distinct_ids=["p1"], properties={"name": "p1"}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-09T12:00:00Z", + properties={"key": "val"}, + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-10T12:00:00Z", + properties={"key": "val"}, + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-11T12:00:00Z", + properties={"key": "val"}, + ) + + p2 = _create_person(team_id=self.team.pk, distinct_ids=["p2"], properties={}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p2", + timestamp="2020-01-09T12:00:00Z", + properties={"key": "val"}, + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p2", + timestamp="2020-01-10T12:00:00Z", + properties={"key": "val"}, + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p2", + timestamp="2020-01-11T12:00:00Z", + properties={"key": "val"}, + ) + + people = self.client.get( + f"/api/projects/{self.team.id}/actions/people/", + data={ + "date_from": "2020-01-10", + "date_to": "2020-01-10", + ENTITY_TYPE: "events", + ENTITY_ID: "$pageview", + "breakdown_type": "person", + "breakdown_value": "p1", + "breakdown": "name", + }, + ).json() + self.assertEqual(len(people["results"][0]["people"]), 1) + + people = self.client.get( + f"/api/projects/{self.team.id}/actions/people/", + data={ + "date_from": "2020-01-10", + "date_to": "2020-01-10", + ENTITY_TYPE: "events", + ENTITY_ID: "$pageview", + "breakdown_type": "person", + "breakdown_value": "", + 
"breakdown": "name", + }, + ).json() + self.assertEqual(len(people["results"][0]["people"]), 1) + + def test_breakdown_by_event_property_none_people_endpoint(self): + p1 = _create_person(team_id=self.team.pk, distinct_ids=["p1"], properties={"name": "p1"}) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-09T12:00:00Z", + properties={"key": "val"}, + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-10T12:00:00Z", + properties={"key": "val"}, + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p1", + timestamp="2020-01-11T12:00:00Z", + properties={"key": "val"}, + ) + + p2 = _create_person(team_id=self.team.pk, distinct_ids=["p2"], properties={"name": "p2"}) + _create_event( + team=self.team, event="$pageview", distinct_id="p2", timestamp="2020-01-09T12:00:00Z", properties={}, + ) + _create_event( + team=self.team, + event="$pageview", + distinct_id="p2", + timestamp="2020-01-11T12:00:00Z", + properties={"key": "val"}, + ) + + people = self.client.get( + f"/api/projects/{self.team.id}/actions/people/", + data={ + "date_from": "2020-01-8", + "date_to": "2020-01-12", + ENTITY_TYPE: "events", + ENTITY_ID: "$pageview", + "display": TRENDS_CUMULATIVE, # ensure that the date range is used as is + "breakdown_type": "event", + "breakdown_value": "val", + "breakdown": "key", + }, + ).json() + self.assertEqual(len(people["results"][0]["people"]), 2) + + people = self.client.get( + f"/api/projects/{self.team.id}/actions/people/", + data={ + "date_from": "2020-01-08", + "date_to": "2020-01-12", + ENTITY_TYPE: "events", + ENTITY_ID: "$pageview", + "display": TRENDS_CUMULATIVE, # ensure that the date range is used as is + "breakdown_type": "event", + "breakdown_value": "", + "breakdown": "key", + }, + ).json() + self.assertEqual(len(people["results"][0]["people"]), 1) + + def _test_interval(self, date_from, interval, timestamps): + for index, ts in enumerate(timestamps): + _create_person(team_id=self.team.pk, distinct_ids=[f"person{index}"]) + _create_event( + team=self.team, + event="watched movie", + distinct_id=f"person{index}", + timestamp=ts, + properties={"event_prop": f"prop{index}"}, + ) + + people = self.client.get( + f"/api/projects/{self.team.id}/actions/people/", + data={"interval": interval, "date_from": date_from, ENTITY_TYPE: "events", ENTITY_ID: "watched movie"}, + ).json() + + self.assertCountEqual( + [person["distinct_ids"][0] for person in people["results"][0]["people"]], ["person1", "person2"] + ) + + def test_interval_month(self): + self._test_interval( + date_from="2021-08-01T00:00:00Z", + interval="month", + timestamps=[ + "2021-07-31T23:45:00Z", + "2021-08-01T00:12:00Z", + "2021-08-31T22:40:00Z", + "2021-09-01T00:00:10Z", + ], + ) + + def test_interval_week(self): + self._test_interval( + date_from="2021-09-05T00:00:00Z", + interval="week", + timestamps=[ + "2021-09-04T23:45:00Z", + "2021-09-05T00:12:00Z", + "2021-09-11T22:40:00Z", + "2021-09-12T00:00:10Z", + ], + ) + + def test_interval_day(self): + self._test_interval( + date_from="2021-09-05T00:00:00Z", + interval="day", + timestamps=[ + "2021-09-04T23:45:00Z", + "2021-09-05T00:12:00Z", + "2021-09-05T22:40:00Z", + "2021-09-06T00:00:10Z", + ], + ) + + def test_interval_hour(self): + self._test_interval( + date_from="2021-09-05T16:00:00Z", + interval="hour", + timestamps=[ + "2021-09-05T15:45:00Z", + "2021-09-05T16:01:12Z", + "2021-09-05T16:58:00Z", + "2021-09-05T17:00:10Z", + ], + ) + + def 
test_interval_minute(self): + self._test_interval( + date_from="2021-09-05T16:05:00Z", + interval="minute", + timestamps=[ + "2021-09-05T16:04:55Z", + "2021-09-05T16:05:12Z", + "2021-09-05T16:05:58Z", + "2021-09-05T16:06:10Z", + ], + ) diff --git a/ee/clickhouse/views/test/test_clickhouse_element.py b/ee/clickhouse/views/test/test_clickhouse_element.py index cc89e3fcc6832..a4b2b084697f7 100644 --- a/ee/clickhouse/views/test/test_clickhouse_element.py +++ b/ee/clickhouse/views/test/test_clickhouse_element.py @@ -2,7 +2,7 @@ from ee.clickhouse.models.event import create_event from ee.clickhouse.util import ClickhouseTestMixin -from posthog.api.test.test_element import test_element_factory +from posthog.api.test.test_element import factory_test_element from posthog.models import Event @@ -12,6 +12,6 @@ def _create_event(**kwargs): class TestElement( - ClickhouseTestMixin, test_element_factory(_create_event) # type: ignore + ClickhouseTestMixin, factory_test_element(_create_event) # type: ignore ): pass diff --git a/ee/clickhouse/views/test/test_clickhouse_event.py b/ee/clickhouse/views/test/test_clickhouse_event.py index b8551d9ac9059..c26d323d76c75 100644 --- a/ee/clickhouse/views/test/test_clickhouse_event.py +++ b/ee/clickhouse/views/test/test_clickhouse_event.py @@ -1,8 +1,11 @@ +from unittest.mock import patch from uuid import uuid4 +from django.utils import timezone + from ee.clickhouse.models.event import create_event from ee.clickhouse.util import ClickhouseTestMixin -from posthog.api.test.test_event import test_event_api_factory +from posthog.api.test.test_event import factory_test_event_api from posthog.models import Action, ActionStep, Event, Person @@ -24,7 +27,20 @@ def _create_person(**kwargs): class ClickhouseTestEventApi( - ClickhouseTestMixin, test_event_api_factory(_create_event, _create_person, _create_action) # type: ignore + ClickhouseTestMixin, factory_test_event_api(_create_event, _create_person, _create_action) # type: ignore ): def test_live_action_events(self): pass + + @patch("ee.clickhouse.views.events.sync_execute") + def test_optimize_query(self, patch_sync_execute): + #  For ClickHouse we normally only query the last day, + # but if a user doesn't have many events we still want to return events that are older + patch_sync_execute.return_value = [("event", "d", "{}", timezone.now(), "d", "d", "d")] + response = self.client.get(f"/api/projects/{self.team.id}/events/").json() + self.assertEqual(len(response["results"]), 1) + self.assertEqual(patch_sync_execute.call_count, 2) + + patch_sync_execute.return_value = [("event", "d", "{}", timezone.now(), "d", "d", "d") for _ in range(0, 100)] + response = self.client.get(f"/api/projects/{self.team.id}/events/").json() + self.assertEqual(patch_sync_execute.call_count, 3) diff --git a/ee/clickhouse/views/test/test_clickhouse_funnel_correlation.py b/ee/clickhouse/views/test/test_clickhouse_funnel_correlation.py new file mode 100644 index 0000000000000..d6ca6cfcfc825 --- /dev/null +++ b/ee/clickhouse/views/test/test_clickhouse_funnel_correlation.py @@ -0,0 +1,503 @@ +import dataclasses +import json +from datetime import datetime +from typing import Any, Dict, List, Optional, TypedDict +from uuid import uuid4 + +import pytest +from django.core.cache import cache +from django.test import Client +from freezegun import freeze_time + +from ee.clickhouse.models.event import create_event +from posthog.constants import FunnelCorrelationType +from posthog.models.element import Element +from posthog.models.person import Person +from 
posthog.models.team import Team + from posthog.test.base import BaseTest + + + @pytest.mark.clickhouse_only + class FunnelCorrelationTest(BaseTest): + """ + Tests for /api/projects/:project_id/funnel/correlation/ + """ + + CLASS_DATA_LEVEL_SETUP = False + + def test_requires_authn(self): + response = get_funnel_correlation( + client=self.client, + team_id=self.team.pk, + request=FunnelCorrelationRequest(date_to="2020-04-04", events=json.dumps([])), + ) + assert response.status_code == 401 + + def test_event_correlation_endpoint_picks_up_events_for_odds_ratios(self): + with freeze_time("2020-01-01"): + self.client.force_login(self.user) + + # Add in two people: + # + # Person 1 - a single signup event + # Person 2 - a signup event and a view insights event + # + # Both of them have a "watched video" event + # + # We then create Person 3, who is successful but + # has not watched the video. + # + # So our contingency table for "watched video" should be + # + # | | success | failure | total | + # | ---------------- | -------- | -------- | -------- | + # | watched | 1 | 1 | 2 | + # | did not watch | 1 | 0 | 1 | + # | total | 2 | 1 | 3 | + # + # For calculating the odds ratio, we add a prior count of 1 to everything + # + # So our odds ratio should be + # ((success + prior) / (failure + prior)) * ((failure_total - failure + prior) / (success_total - success + prior)) + # = ((1 + 1) / (1 + 1)) * ((1 - 1 + 1) / (2 - 1 + 1)) + # = 1 / 2 + + events = { + "Person 1": [ + #  Failure / watched + {"event": "signup", "timestamp": datetime(2020, 1, 1)}, + {"event": "watched video", "timestamp": datetime(2020, 1, 2)}, + ], + "Person 2": [ + #  Success / watched + {"event": "signup", "timestamp": datetime(2020, 1, 1)}, + {"event": "watched video", "timestamp": datetime(2020, 1, 2)}, + {"event": "view insights", "timestamp": datetime(2020, 1, 3)}, + ], + "Person 3": [ + # Success / did not watch + {"event": "signup", "timestamp": datetime(2020, 1, 1)}, + {"event": "view insights", "timestamp": datetime(2020, 1, 3)}, + ], + } + + create_events(events_by_person=events, team=self.team) + + odds = get_funnel_correlation_ok( + client=self.client, + team_id=self.team.pk, + request=FunnelCorrelationRequest( + events=json.dumps([EventPattern(id="signup"), EventPattern(id="view insights")]), + date_to="2020-04-04", + ), + ) + + assert odds == { + "is_cached": False, + "last_refresh": "2020-01-01T00:00:00Z", + "result": { + "events": [ + { + "event": {"event": "watched video", "elements": [], "properties": {}}, + "success_count": 1, + "failure_count": 1, + "odds_ratio": 1 / 2, + "correlation_type": "failure", + }, + ], + "skewed": False, + }, + } + + def test_event_correlation_is_partitioned_by_team(self): + """ + Ensure there's no crosstalk between teams + + We check this by: + + 1. loading events into team 1 + 2. checking correlation for team 1 + 3. loading events into team 2 + 4. 
checking correlation for team 1 again; the results should be the same + + """ + with freeze_time("2020-01-01"): + self.client.force_login(self.user) + + events = { + "Person 1": [ + {"event": "watched video", "timestamp": datetime(2019, 1, 2)}, + {"event": "signup", "timestamp": datetime(2020, 1, 1)}, + ], + "Person 2": [ + {"event": "watched video", "timestamp": datetime(2019, 1, 2)}, + {"event": "signup", "timestamp": datetime(2020, 1, 1)}, + {"event": "view insights", "timestamp": datetime(2020, 1, 3)}, + ], + } + + create_events(events_by_person=events, team=self.team) + + odds_before = get_funnel_correlation_ok( + client=self.client, + team_id=self.team.pk, + request=FunnelCorrelationRequest( + events=json.dumps([EventPattern(id="signup"), EventPattern(id="view insights")]), + date_to="2020-04-04", + ), + ) + + other_team = create_team(organization=self.organization) + create_events(events_by_person=events, team=other_team) + + # We need to make sure we clear the cache so we get the same results again + cache.clear() + + odds_after = get_funnel_correlation_ok( + client=self.client, + team_id=self.team.pk, + request=FunnelCorrelationRequest( + events=json.dumps([EventPattern(id="signup"), EventPattern(id="view insights")]), + date_to="2020-04-04", + ), + ) + + assert odds_before == odds_after + + def test_event_correlation_endpoint_does_not_include_historical_events(self): + with freeze_time("2020-01-01"): + self.client.force_login(self.user) + + # Add in two people: + # + # Person 1 - a single signup event + # Person 2 - a signup event and a view insights event + # + # Both of them have a "watched video" event but they are before the + # signup event + + events = { + "Person 1": [ + {"event": "watched video", "timestamp": datetime(2019, 1, 2)}, + {"event": "signup", "timestamp": datetime(2020, 1, 1)}, + ], + "Person 2": [ + {"event": "watched video", "timestamp": datetime(2019, 1, 2)}, + {"event": "signup", "timestamp": datetime(2020, 1, 1)}, + {"event": "view insights", "timestamp": datetime(2020, 1, 3)}, + ], + } + + create_events(events_by_person=events, team=self.team) + + # We need to make sure we clear the cache so other tests that have run + # don't interfere with this test + cache.clear() + + odds = get_funnel_correlation_ok( + client=self.client, + team_id=self.team.pk, + request=FunnelCorrelationRequest( + events=json.dumps([EventPattern(id="signup"), EventPattern(id="view insights")]), + date_to="2020-04-04", + ), + ) + + assert odds == { + "is_cached": False, + "last_refresh": "2020-01-01T00:00:00Z", + "result": {"events": [], "skewed": False}, + } + + def test_event_correlation_endpoint_does_not_include_funnel_steps(self): + with freeze_time("2020-01-01"): + self.client.force_login(self.user) + + # Add Person1 with only the funnel steps involved + + events = { + "Person 1": [ + {"event": "signup", "timestamp": datetime(2020, 1, 1)}, + {"event": "some waypoint", "timestamp": datetime(2020, 1, 2)}, + {"event": "", "timestamp": datetime(2020, 1, 3)}, + ], + # We need at least 1 success and 1 failure to return a result + "Person 2": [ + {"event": "signup", "timestamp": datetime(2020, 1, 1)}, + {"event": "some waypoint", "timestamp": datetime(2020, 1, 2)}, + {"event": "view insights", "timestamp": datetime(2020, 1, 3)}, + ], + } + # '' is a weird event name to have, but if it exists, it's our duty to report it + + create_events(events_by_person=events, team=self.team) + + # We need to make sure we clear the cache so other tests that have run + # don't interfere with this test + cache.clear() + + 
odds = get_funnel_correlation_ok( + client=self.client, + team_id=self.team.pk, + request=FunnelCorrelationRequest( + events=json.dumps( + [EventPattern(id="signup"), EventPattern(id="some waypoint"), EventPattern(id="view insights")] + ), + date_to="2020-04-04", + ), + ) + + assert odds == { + "is_cached": False, + "last_refresh": "2020-01-01T00:00:00Z", + "result": { + "events": [ + { + "correlation_type": "failure", + "event": {"event": "", "elements": [], "properties": {}}, + "failure_count": 1, + "odds_ratio": 1 / 4, + "success_count": 0, + } + ], + "skewed": False, + }, + } + + def test_correlation_endpoint_with_properties(self): + self.client.force_login(self.user) + + for i in range(10): + create_person(distinct_ids=[f"user_{i}"], team_id=self.team.pk, properties={"$browser": "Positive"}) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_{i}", timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, event="paid", distinct_id=f"user_{i}", timestamp="2020-01-04T14:00:00Z", + ) + + for i in range(10, 20): + create_person(distinct_ids=[f"user_{i}"], team_id=self.team.pk, properties={"$browser": "Negative"}) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_{i}", timestamp="2020-01-02T14:00:00Z", + ) + if i % 2 == 0: + _create_event( + team=self.team, + event="negatively_related", + distinct_id=f"user_{i}", + timestamp="2020-01-03T14:00:00Z", + ) + + # We need to make sure we clear the cache so other tests that have run + # don't interfere with this test + cache.clear() + + api_response = get_funnel_correlation_ok( + client=self.client, + team_id=self.team.pk, + request=FunnelCorrelationRequest( + events=json.dumps([EventPattern(id="user signed up"), EventPattern(id="paid")]), + date_to="2020-01-14", + date_from="2020-01-01", + funnel_correlation_type=FunnelCorrelationType.PROPERTIES, + funnel_correlation_names=json.dumps(["$browser"]), + ), + ) + + self.assertFalse(api_response["result"]["skewed"]) + + result = api_response["result"]["events"] + + odds_ratios = [item.pop("odds_ratio") for item in result] + expected_odds_ratios = [121, 1 / 121] + + for odds, expected_odds in zip(odds_ratios, expected_odds_ratios): + self.assertAlmostEqual(odds, expected_odds) + + self.assertEqual( + result, + [ + { + "event": {"event": "$browser::Positive", "elements": [], "properties": {}}, + "success_count": 10, + "failure_count": 0, + # "odds_ratio": 121.0, + "correlation_type": "success", + }, + { + "event": {"event": "$browser::Negative", "elements": [], "properties": {}}, + "success_count": 0, + "failure_count": 10, + # "odds_ratio": 1 / 121, + "correlation_type": "failure", + }, + ], + ) + + def test_correlation_endpoint_request_with_no_steps_doesnt_fail(self): + """ + This just checks that we get an empty result; this mimics what happens + with other insight endpoints. It's questionable whether this should perhaps + be a 400 instead. 
+ """ + self.client.force_login(self.user) + + with freeze_time("2020-01-01"): + response = get_funnel_correlation_ok( + client=self.client, + team_id=self.team.pk, + request=FunnelCorrelationRequest( + events=json.dumps([]), + date_to="2020-01-14", + date_from="2020-01-01", + funnel_correlation_type=FunnelCorrelationType.PROPERTIES, + funnel_correlation_names=json.dumps(["$browser"]), + ), + ) + + assert response == { + "is_cached": False, + "last_refresh": "2020-01-01T00:00:00Z", + "result": {"events": [], "skewed": False}, + } + + def test_funnel_correlation_with_event_properties_autocapture(self): + self.client.force_login(self.user) + + # Need a minimum of 3 hits to get a correlation result + for i in range(3): + create_person(distinct_ids=[f"user_{i}"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_{i}", timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, + event="$autocapture", + distinct_id=f"user_{i}", + elements=[Element(nth_of_type=1, nth_child=0, tag_name="a", href="/movie")], + timestamp="2020-01-03T14:00:00Z", + properties={"signup_source": "email", "$event_type": "click"}, + ) + _create_event( + team=self.team, event="paid", distinct_id=f"user_{i}", timestamp="2020-01-04T14:00:00Z", + ) + + # At least one person who fails, to ensure we get results + create_person(distinct_ids=[f"user_fail"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_fail", timestamp="2020-01-02T14:00:00Z", + ) + + with freeze_time("2020-01-01"): + response = get_funnel_correlation_ok( + client=self.client, + team_id=self.team.pk, + request=FunnelCorrelationRequest( + events=json.dumps([EventPattern(id="user signed up"), EventPattern(id="paid")]), + date_to="2020-01-14", + date_from="2020-01-01", + funnel_correlation_type=FunnelCorrelationType.EVENT_WITH_PROPERTIES, + funnel_correlation_event_names=json.dumps(["$autocapture"]), + ), + ) + + assert response == { + "result": { + "events": [ + { + "success_count": 3, + "failure_count": 0, + "odds_ratio": 8.0, + "correlation_type": "success", + "event": { + "event": '$autocapture::elements_chain::click__~~__a:href="/movie"nth-child="0"nth-of-type="1"', + "properties": {"$event_type": "click"}, + "elements": [ + { + "event": None, + "text": None, + "tag_name": "a", + "attr_class": None, + "href": "/movie", + "attr_id": None, + "nth_child": 0, + "nth_of_type": 1, + "attributes": {}, + "order": 0, + } + ], + }, + }, + ], + "skewed": False, + }, + "last_refresh": "2020-01-01T00:00:00Z", + "is_cached": False, + } + + + @pytest.fixture(autouse=True) + def clear_django_cache(): + cache.clear() + + + def create_team(organization): + return Team.objects.create(name="Test Team", organization=organization) + + + def create_events(events_by_person, team: Team): + """ + Helper for creating specific events for a team. 
+ """ + for distinct_id, events in events_by_person.items(): + create_person(distinct_ids=[distinct_id], team=team) + for event in events: + _create_event( + team=team, + distinct_id=distinct_id, + event=event["event"], + timestamp=event["timestamp"], + properties=event.get("properties", {}), + ) + + +class EventPattern(TypedDict): + id: str + + +@dataclasses.dataclass +class FunnelCorrelationRequest: + # Needs to be json encoded list of `EventPattern`s + events: str + date_to: str + funnel_step: Optional[int] = None + date_from: Optional[str] = None + funnel_correlation_type: Optional[FunnelCorrelationType] = None + # Needs to be json encoded list of `str`s + funnel_correlation_names: Optional[str] = None + funnel_correlation_event_names: Optional[str] = None + + +def get_funnel_correlation(client: Client, team_id: int, request: FunnelCorrelationRequest): + return client.get( + f"/api/projects/{team_id}/insights/funnel/correlation", + data={key: value for key, value in dataclasses.asdict(request).items() if value is not None}, + ) + + +def get_funnel_correlation_ok(client: Client, team_id: int, request: FunnelCorrelationRequest) -> Dict[str, Any]: + response = get_funnel_correlation(client=client, team_id=team_id, request=request) + + assert response.status_code == 200 + return response.json() + + +def create_person(**kwargs): + person = Person.objects.create(**kwargs) + return person + + +def _create_event(**kwargs): + kwargs.update({"event_uuid": uuid4()}) + create_event(**kwargs) diff --git a/ee/clickhouse/views/test/test_clickhouse_funnel_person.py b/ee/clickhouse/views/test/test_clickhouse_funnel_person.py new file mode 100644 index 0000000000000..6c5080b01211a --- /dev/null +++ b/ee/clickhouse/views/test/test_clickhouse_funnel_person.py @@ -0,0 +1,323 @@ +import json +from unittest.mock import patch +from uuid import uuid4 + +from django.core.cache import cache +from rest_framework import status + +from ee.clickhouse.models.event import create_event +from ee.clickhouse.util import ClickhouseTestMixin +from posthog.constants import INSIGHT_FUNNELS +from posthog.models.person import Person +from posthog.test.base import APIBaseTest + + +def _create_person(**kwargs): + person = Person.objects.create(**kwargs) + return person + + +def _create_event(**kwargs): + kwargs.update({"event_uuid": uuid4()}) + create_event(**kwargs) + + +class TestFunnelPerson(ClickhouseTestMixin, APIBaseTest): + def _create_sample_data(self, num, delete=False): + for i in range(num): + person = _create_person(distinct_ids=[f"user_{i}"], team=self.team) + _create_event( + event="step one", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:00:00", + properties={"$browser": "Chrome"}, + ) + _create_event( + event="step two", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-03 00:00:00", + properties={"$browser": "Chrome"}, + ) + _create_event( + event="step three", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-05 00:00:00", + properties={"$browser": "Chrome"}, + ) + if delete: + person.delete() + + def test_basic_format(self): + self._create_sample_data(5) + request_data = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "actions": json.dumps([]), + "events": json.dumps( + [{"id": "step one", "order": 0}, {"id": "step two", "order": 1}, {"id": "step three", "order": 2},] + ), + "properties": json.dumps([]), + "funnel_window_days": 14, + "funnel_step": 1, + "filter_test_accounts": "false", + "new_entity": json.dumps([]), + "date_from": "2021-05-01", + 
"date_to": "2021-05-10", + } + + response = self.client.get("/api/person/funnel/", data=request_data) + self.assertEqual(response.status_code, status.HTTP_200_OK) + j = response.json() + first_person = j["results"][0]["people"][0] + self.assertEqual(5, len(j["results"][0]["people"])) + self.assertTrue("id" in first_person and "name" in first_person and "distinct_ids" in first_person) + self.assertEqual(5, j["results"][0]["count"]) + + def test_basic_pagination(self): + cache.clear() + self._create_sample_data(110) + request_data = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "actions": json.dumps([]), + "events": json.dumps( + [{"id": "step one", "order": 0}, {"id": "step two", "order": 1}, {"id": "step three", "order": 2},] + ), + "properties": json.dumps([]), + "funnel_window_days": 14, + "funnel_step": 1, + "filter_test_accounts": "false", + "new_entity": json.dumps([]), + "date_from": "2021-05-01", + "date_to": "2021-05-10", + } + + response = self.client.get("/api/person/funnel/", data=request_data) + self.assertEqual(response.status_code, status.HTTP_200_OK) + j = response.json() + people = j["results"][0]["people"] + next = j["next"] + self.assertEqual(100, len(people)) + self.assertNotEqual(None, next) + + response = self.client.get(next) + self.assertEqual(response.status_code, status.HTTP_200_OK) + j = response.json() + people = j["results"][0]["people"] + next = j["next"] + self.assertEqual(10, len(people)) + self.assertEqual(None, j["next"]) + + def test_breakdown_basic_pagination(self): + cache.clear() + self._create_sample_data(110) + request_data = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "actions": json.dumps([]), + "events": json.dumps( + [{"id": "step one", "order": 0}, {"id": "step two", "order": 1}, {"id": "step three", "order": 2},] + ), + "properties": json.dumps([]), + "funnel_window_days": 14, + "funnel_step": 1, + "filter_test_accounts": "false", + "new_entity": json.dumps([]), + "date_from": "2021-05-01", + "date_to": "2021-05-10", + "breakdown_type": "event", + "breakdown": "$browser", + "funnel_step_breakdown": "Chrome", + } + + response = self.client.get("/api/person/funnel/", data=request_data) + self.assertEqual(response.status_code, status.HTTP_200_OK) + j = response.json() + people = j["results"][0]["people"] + next = j["next"] + self.assertEqual(100, len(people)) + + response = self.client.get(next) + self.assertEqual(response.status_code, status.HTTP_200_OK) + j = response.json() + people = j["results"][0]["people"] + next = j["next"] + self.assertEqual(10, len(people)) + self.assertEqual(None, j["next"]) + + @patch("ee.clickhouse.models.person.delete_person") + def test_basic_pagination_with_deleted(self, delete_person_patch): + cache.clear() + self._create_sample_data(110, delete=True) + request_data = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "actions": json.dumps([]), + "events": json.dumps( + [{"id": "step one", "order": 0}, {"id": "step two", "order": 1}, {"id": "step three", "order": 2},] + ), + "properties": json.dumps([]), + "funnel_window_days": 14, + "funnel_step": 1, + "filter_test_accounts": "false", + "new_entity": json.dumps([]), + "date_from": "2021-05-01", + "date_to": "2021-05-10", + } + + response = self.client.get("/api/person/funnel/", data=request_data) + self.assertEqual(response.status_code, status.HTTP_200_OK) + j = response.json() + people = j["results"][0]["people"] + next = j["next"] + self.assertEqual(0, len(people)) + self.assertIsNone(next) + + def test_breakdowns(self): + request_data = 
{ + "insight": INSIGHT_FUNNELS, + "interval": "day", + "actions": json.dumps([]), + "properties": json.dumps([]), + "funnel_step": 1, + "filter_test_accounts": "false", + "new_entity": json.dumps([]), + "events": json.dumps( + [{"id": "sign up", "order": 0}, {"id": "play movie", "order": 1}, {"id": "buy", "order": 2},] + ), + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-08", + "funnel_window_days": 7, + "breakdown": "$browser", + "funnel_step_breakdown": "Chrome", + } + + # event + person1 = _create_person(distinct_ids=["person1"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person1", + properties={"key": "val", "$browser": "Chrome"}, + timestamp="2020-01-01T12:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person1", + properties={"key": "val", "$browser": "Chrome"}, + timestamp="2020-01-01T13:00:00Z", + ) + _create_event( + team=self.team, + event="buy", + distinct_id="person1", + properties={"key": "val", "$browser": "Chrome"}, + timestamp="2020-01-01T15:00:00Z", + ) + + person2 = _create_person(distinct_ids=["person2"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person2", + properties={"key": "val", "$browser": "Safari"}, + timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, + event="play movie", + distinct_id="person2", + properties={"key": "val", "$browser": "Safari"}, + timestamp="2020-01-02T16:00:00Z", + ) + + person3 = _create_person(distinct_ids=["person3"], team_id=self.team.pk) + _create_event( + team=self.team, + event="sign up", + distinct_id="person3", + properties={"key": "val", "$browser": "Safari"}, + timestamp="2020-01-02T14:00:00Z", + ) + + response = self.client.get("/api/person/funnel/", data=request_data) + self.assertEqual(response.status_code, status.HTTP_200_OK) + j = response.json() + + people = j["results"][0]["people"] + self.assertEqual(1, len(people)) + self.assertEqual(None, j["next"]) + + response = self.client.get("/api/person/funnel/", data={**request_data, "funnel_step_breakdown": "Safari"}) + self.assertEqual(response.status_code, status.HTTP_200_OK) + j = response.json() + + people = j["results"][0]["people"] + self.assertEqual(2, len(people)) + self.assertEqual(None, j["next"]) + + +class TestFunnelCorrelationPersons(ClickhouseTestMixin, APIBaseTest): + """ + Tests for /api/projects/:project_id/persons/funnel/correlation/ + """ + + def test_pagination(self): + cache.clear() + + for i in range(10): + _create_person(distinct_ids=[f"user_{i}"], team_id=self.team.pk) + _create_event( + team=self.team, event="user signed up", distinct_id=f"user_{i}", timestamp="2020-01-02T14:00:00Z", + ) + _create_event( + team=self.team, event="positively_related", distinct_id=f"user_{i}", timestamp="2020-01-03T14:00:00Z", + ) + _create_event( + team=self.team, event="paid", distinct_id=f"user_{i}", timestamp="2020-01-04T14:00:00Z", + ) + + request_data = { + "events": json.dumps( + [{"id": "user signed up", "type": "events", "order": 0}, {"id": "paid", "type": "events", "order": 1},] + ), + "insight": INSIGHT_FUNNELS, + "date_from": "2020-01-01", + "date_to": "2020-01-14", + "funnel_correlation_type": "events", + "funnel_correlation_person_converted": "true", + "funnel_correlation_person_limit": 4, + "funnel_correlation_person_entity": json.dumps({"id": "positively_related", "type": "events"}), + } + + response = self.client.get(f"/api/projects/{self.team.pk}/persons/funnel/correlation", 
data=request_data) + self.assertEqual(response.status_code, status.HTTP_200_OK) + j = response.json() + + first_person = j["results"][0]["people"][0] + self.assertEqual(4, len(j["results"][0]["people"])) + self.assertTrue("id" in first_person and "name" in first_person and "distinct_ids" in first_person) + self.assertEqual(4, j["results"][0]["count"]) + + next = j["next"] + response = self.client.get(next) + self.assertEqual(response.status_code, status.HTTP_200_OK) + j = response.json() + + people = j["results"][0]["people"] + next = j["next"] + self.assertEqual(4, len(people)) + self.assertNotEqual(None, next) + + response = self.client.get(next) + self.assertEqual(response.status_code, status.HTTP_200_OK) + j = response.json() + people = j["results"][0]["people"] + next = j["next"] + self.assertEqual(2, len(people)) + self.assertEqual(None, j["next"]) diff --git a/ee/clickhouse/views/test/test_clickhouse_funnel_trends_person.py b/ee/clickhouse/views/test/test_clickhouse_funnel_trends_person.py new file mode 100644 index 0000000000000..4ee2d1146e9e9 --- /dev/null +++ b/ee/clickhouse/views/test/test_clickhouse_funnel_trends_person.py @@ -0,0 +1,73 @@ +import json +from uuid import uuid4 + +from django.core.cache import cache +from rest_framework import status + +from ee.clickhouse.models.event import create_event +from ee.clickhouse.util import ClickhouseTestMixin +from posthog.constants import INSIGHT_FUNNELS, FunnelVizType +from posthog.models.person import Person +from posthog.test.base import APIBaseTest + + +def _create_person(**kwargs): + person = Person.objects.create(**kwargs) + return Person(id=person.uuid, uuid=person.uuid) + + +def _create_event(**kwargs): + kwargs.update({"event_uuid": uuid4()}) + create_event(**kwargs) + + +class TestFunnelTrendsPerson(ClickhouseTestMixin, APIBaseTest): + def test_basic_format(self): + user_a = _create_person(distinct_ids=["user a"], team=self.team) + + _create_event(event="step one", distinct_id="user a", team=self.team, timestamp="2021-06-07 19:00:00") + + common_request_data = { + "insight": INSIGHT_FUNNELS, + "funnel_viz_type": FunnelVizType.TRENDS, + "interval": "day", + "date_from": "2021-06-07", + "date_to": "2021-06-13 23:59:59", + "funnel_window_days": 7, + "events": json.dumps( + [{"id": "step one", "order": 0}, {"id": "step two", "order": 1}, {"id": "step three", "order": 2},] + ), + "properties": json.dumps([]), + "funnel_window_days": 7, + "new_entity": json.dumps([]), + } + + # 1 user who dropped off starting 2021-06-07 + response_1 = self.client.get( + "/api/person/funnel/", + data={**common_request_data, "entrance_period_start": "2021-06-07", "drop_off": True,}, + ) + response_1_data = response_1.json() + + self.assertEqual(response_1.status_code, status.HTTP_200_OK) + self.assertEqual([person["uuid"] for person in response_1_data["results"][0]["people"]], [str(user_a.uuid)]) + + # No users converted 2021-06-07 + response_2 = self.client.get( + "/api/person/funnel/", + data={**common_request_data, "entrance_period_start": "2021-06-07 00:00", "drop_off": False,}, + ) + response_2_data = response_2.json() + + self.assertEqual(response_2.status_code, status.HTTP_200_OK) + self.assertEqual([person["uuid"] for person in response_2_data["results"][0]["people"]], []) + + # No users dropped off starting 2021-06-08 + response_3 = self.client.get( + "/api/person/funnel/", + data={**common_request_data, "entrance_period_start": "2021-06-08", "drop_off": True,}, + ) + response_3_data = response_3.json() + + 
self.assertEqual(response_3.status_code, status.HTTP_200_OK) + self.assertEqual([person["uuid"] for person in response_3_data["results"][0]["people"]], []) diff --git a/ee/clickhouse/views/test/test_clickhouse_insights.py b/ee/clickhouse/views/test/test_clickhouse_insights.py index ee1710b6241a0..610c10b3163e9 100644 --- a/ee/clickhouse/views/test/test_clickhouse_insights.py +++ b/ee/clickhouse/views/test/test_clickhouse_insights.py @@ -1,9 +1,18 @@ +import json +from typing import Any, Dict, List, Union +from unittest.mock import patch from uuid import uuid4 +from rest_framework import status + +from ee.api.test.base import LicensedTestMixin from ee.clickhouse.models.event import create_event from ee.clickhouse.util import ClickhouseTestMixin +from ee.models.explicit_team_membership import ExplicitTeamMembership from posthog.api.test.test_insight import insight_test_factory +from posthog.models.organization import OrganizationMembership from posthog.models.person import Person +from posthog.test.base import APIBaseTest def _create_person(**kwargs): @@ -17,6 +26,545 @@ def _create_event(**kwargs): class ClickhouseTestInsights( - ClickhouseTestMixin, insight_test_factory(_create_event, _create_person) # type: ignore + ClickhouseTestMixin, LicensedTestMixin, insight_test_factory(_create_event, _create_person) # type: ignore ): - pass + # Extra permissioning tests here + def test_insight_trends_allowed_if_project_open_and_org_member(self): + self.organization_membership.level = OrganizationMembership.Level.MEMBER + self.organization_membership.save() + self.team.access_control = False + self.team.save() + response = self.client.get( + f"/api/projects/{self.team.id}/insights/trend/?events={json.dumps([{'id': '$pageview'}])}" + ) + self.assertEqual(response.status_code, status.HTTP_200_OK) + + def test_insight_trends_forbidden_if_project_private_and_org_member(self): + self.organization_membership.level = OrganizationMembership.Level.MEMBER + self.organization_membership.save() + self.team.access_control = True + self.team.save() + response = self.client.get( + f"/api/projects/{self.team.id}/insights/trend/?events={json.dumps([{'id': '$pageview'}])}" + ) + self.assertDictEqual(self.permission_denied_response("You don't have access to the project."), response.json()) + self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) + + def test_insight_trends_allowed_if_project_private_and_org_member_and_project_member(self): + self.organization_membership.level = OrganizationMembership.Level.MEMBER + self.organization_membership.save() + self.team.access_control = True + self.team.save() + self_team_membership = ExplicitTeamMembership.objects.create( + team=self.team, parent_membership=self.organization_membership, level=ExplicitTeamMembership.Level.MEMBER + ) + response = self.client.get( + f"/api/projects/{self.team.id}/insights/trend/?events={json.dumps([{'id': '$pageview'}])}" + ) + self.assertEqual(response.status_code, status.HTTP_200_OK) + + +class ClickhouseTestFunnelTypes(ClickhouseTestMixin, APIBaseTest): + def test_funnel_unordered_basic_post(self): + _create_person(distinct_ids=["1"], team=self.team) + _create_event(team=self.team, event="step one", distinct_id="1") + _create_event(team=self.team, event="step two", distinct_id="1") + + _create_person(distinct_ids=["2"], team=self.team) + _create_event(team=self.team, event="step two", distinct_id="2") + _create_event(team=self.team, event="step one", distinct_id="2") + + response = self.client.post( + 
f"/api/projects/{self.team.id}/insights/funnel/", + { + "events": [ + {"id": "step one", "type": "events", "order": 0}, + {"id": "step two", "type": "events", "order": 1}, + ], + "funnel_window_days": 14, + "funnel_order_type": "unordered", + "insight": "funnels", + }, + ).json() + + self.assertEqual(len(response["result"]), 2) + self.assertEqual(response["result"][0]["name"], "step one") + self.assertEqual(response["result"][1]["name"], "step two") + self.assertEqual(response["result"][0]["count"], 2) + self.assertEqual(response["result"][1]["count"], 2) + + def test_funnel_strict_basic_post(self): + _create_person(distinct_ids=["1"], team=self.team) + _create_event(team=self.team, event="step one", distinct_id="1") + _create_event(team=self.team, event="step two", distinct_id="1") + + _create_person(distinct_ids=["2"], team=self.team) + _create_event(team=self.team, event="step one", distinct_id="2") + _create_event(team=self.team, event="blahh", distinct_id="2") + _create_event(team=self.team, event="step two", distinct_id="2") + + response = self.client.post( + f"/api/projects/{self.team.id}/insights/funnel/", + { + "events": [ + {"id": "step one", "type": "events", "order": 0}, + {"id": "step two", "type": "events", "order": 1}, + ], + "funnel_window_days": 14, + "funnel_order_type": "strict", + "insight": "funnels", + }, + ).json() + + self.assertEqual(len(response["result"]), 2) + self.assertEqual(response["result"][0]["name"], "step one") + self.assertEqual(response["result"][1]["name"], "step two") + self.assertEqual(response["result"][0]["count"], 2) + self.assertEqual(response["result"][1]["count"], 1) + + def test_funnel_trends_basic_post(self): + _create_person(distinct_ids=["user_one"], team=self.team) + _create_person(distinct_ids=["user_two"], team=self.team) + + # user_one, funnel steps: one, two three + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2021-05-01 01:00:00") + _create_event(event="step two", distinct_id="user_one", team=self.team, timestamp="2021-05-03 00:00:00") + _create_event(event="step three", distinct_id="user_one", team=self.team, timestamp="2021-05-05 00:00:00") + + # user_two, funnel steps: one, two + _create_event(event="step one", distinct_id="user_two", team=self.team, timestamp="2021-05-02 00:00:00") + _create_event(event="step two", distinct_id="user_two", team=self.team, timestamp="2021-05-04 00:00:00") + _create_event(event="step three", distinct_id="user_two", team=self.team, timestamp="2021-05-05 00:00:00") + + response = self.client.post( + f"/api/projects/{self.team.id}/insights/funnel/", + { + "events": [ + {"id": "step one", "type": "events", "order": 0}, + {"id": "step two", "type": "events", "order": 1}, + {"id": "step three", "type": "events", "order": 2}, + ], + "funnel_window_days": 7, + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 23:59:59", + "funnel_viz_type": "trends", + }, + ).json() + + self.assertEqual(len(response["result"]), 1) + self.assertEqual(response["result"][0]["count"], 7) + self.assertEqual(response["result"][0]["data"], [100, 100, 0, 0, 0, 0, 0]) + + def test_funnel_trends_unordered_basic_post(self): + _create_person(distinct_ids=["user_one"], team=self.team) + _create_person(distinct_ids=["user_two"], team=self.team) + # user_one, funnel steps: one, two three + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2021-05-01 01:00:00") + _create_event(event="step three", distinct_id="user_one", team=self.team, timestamp="2021-05-03 
00:00:00") + _create_event(event="step two", distinct_id="user_one", team=self.team, timestamp="2021-05-05 00:00:00") + + # user_two, funnel steps: one, two, three + _create_event(event="step three", distinct_id="user_two", team=self.team, timestamp="2021-05-02 00:00:00") + _create_event(event="step one", distinct_id="user_two", team=self.team, timestamp="2021-05-03 00:00:00") + _create_event(event="step two", distinct_id="user_two", team=self.team, timestamp="2021-05-04 00:00:00") + + response = self.client.post( + f"/api/projects/{self.team.id}/insights/funnel/", + { + "events": [ + {"id": "step one", "type": "events", "order": 0}, + {"id": "step two", "type": "events", "order": 1}, + {"id": "step three", "type": "events", "order": 2}, + ], + "funnel_window_days": 7, + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 23:59:59", + "funnel_viz_type": "trends", + "funnel_order_type": "unordered", + }, + ).json() + + self.assertEqual(len(response["result"]), 1) + self.assertEqual(response["result"][0]["count"], 7) + self.assertEqual(response["result"][0]["data"], [100, 100, 0, 0, 0, 0, 0]) + + def test_funnel_trends_basic_post_backwards_compatibility(self): + _create_person(distinct_ids=["user_one"], team=self.team) + _create_person(distinct_ids=["user_two"], team=self.team) + + # user_one, funnel steps: one, two three + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2021-05-01 01:00:00") + _create_event(event="step two", distinct_id="user_one", team=self.team, timestamp="2021-05-03 00:00:00") + _create_event(event="step three", distinct_id="user_one", team=self.team, timestamp="2021-05-05 00:00:00") + + # user_two, funnel steps: one, two + _create_event(event="step one", distinct_id="user_two", team=self.team, timestamp="2021-05-02 00:00:00") + _create_event(event="step two", distinct_id="user_two", team=self.team, timestamp="2021-05-04 00:00:00") + _create_event(event="step three", distinct_id="user_two", team=self.team, timestamp="2021-05-05 00:00:00") + + response = self.client.post( + f"/api/projects/{self.team.id}/insights/funnel/", + { + "events": [ + {"id": "step one", "type": "events", "order": 0}, + {"id": "step two", "type": "events", "order": 1}, + {"id": "step three", "type": "events", "order": 2}, + ], + "funnel_window_days": 7, + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 23:59:59", + "display": "ActionsLineGraph", + }, + ).json() + + self.assertEqual(len(response["result"]), 1) + self.assertEqual(response["result"][0]["count"], 7) + self.assertEqual(response["result"][0]["data"], [100, 100, 0, 0, 0, 0, 0]) + + def test_funnel_trends_strict_basic_post(self): + _create_person(distinct_ids=["user_one"], team=self.team) + _create_person(distinct_ids=["user_two"], team=self.team) + _create_person(distinct_ids=["user_three"], team=self.team) + + # user_one, funnel steps: one, two three + _create_event(event="step one", distinct_id="user_one", team=self.team, timestamp="2021-05-01 01:00:00") + _create_event(event="step two", distinct_id="user_one", team=self.team, timestamp="2021-05-03 00:00:00") + _create_event(event="step three", distinct_id="user_one", team=self.team, timestamp="2021-05-05 00:00:00") + + # user_two, funnel steps: one, two + _create_event(event="step one", distinct_id="user_two", team=self.team, timestamp="2021-05-02 00:00:00") + _create_event(event="step two", distinct_id="user_two", team=self.team, timestamp="2021-05-04 00:00:00") + _create_event(event="blah", distinct_id="user_two", 
team=self.team, timestamp="2021-05-04 02:00:00") + _create_event(event="step three", distinct_id="user_two", team=self.team, timestamp="2021-05-05 00:00:00") + + # user_three, funnel steps: one, two, three + _create_event(event="step one", distinct_id="user_three", team=self.team, timestamp="2021-05-02 00:00:00") + _create_event(event="step two", distinct_id="user_three", team=self.team, timestamp="2021-05-04 00:00:00") + _create_event(event="step three", distinct_id="user_three", team=self.team, timestamp="2021-05-05 00:00:00") + + response = self.client.post( + f"/api/projects/{self.team.id}/insights/funnel/", + { + "events": [ + {"id": "step one", "type": "events", "order": 0}, + {"id": "step two", "type": "events", "order": 1}, + {"id": "step three", "type": "events", "order": 2}, + ], + "funnel_window_days": 7, + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 23:59:59", + "funnel_viz_type": "trends", + "funnel_order_type": "strict", + }, + ).json() + + self.assertEqual(len(response["result"]), 1) + self.assertEqual(response["result"][0]["count"], 7) + self.assertEqual(response["result"][0]["data"], [100, 50, 0, 0, 0, 0, 0]) + + def test_funnel_time_to_convert_auto_bins(self): + _create_person(distinct_ids=["user a"], team=self.team) + _create_person(distinct_ids=["user b"], team=self.team) + _create_person(distinct_ids=["user c"], team=self.team) + + _create_event(event="step one", distinct_id="user a", team=self.team, timestamp="2021-06-08 18:00:00") + _create_event(event="blah", distinct_id="user a", team=self.team, timestamp="2021-06-08 18:30:00") + _create_event(event="step two", distinct_id="user a", team=self.team, timestamp="2021-06-08 19:00:00") + # Converted from 0 to 1 in 3600 s + _create_event(event="step three", distinct_id="user a", team=self.team, timestamp="2021-06-08 21:00:00") + + _create_event(event="step one", distinct_id="user b", team=self.team, timestamp="2021-06-09 13:00:00") + _create_event(event="step two", distinct_id="user b", team=self.team, timestamp="2021-06-09 13:37:00") + # Converted from 0 to 1 in 2200 s + + _create_event(event="step one", distinct_id="user c", team=self.team, timestamp="2021-06-11 07:00:00") + _create_event(event="step two", distinct_id="user c", team=self.team, timestamp="2021-06-12 06:00:00") + # Converted from 0 to 1 in 82_800 s + + response = self.client.post( + f"/api/projects/{self.team.id}/insights/funnel/", + { + "insight": "funnels", + "funnel_viz_type": "time_to_convert", + "interval": "day", + "date_from": "2021-06-07 00:00:00", + "date_to": "2021-06-13 23:59:59", + "funnel_to_step": 1, + "funnel_window_days": 7, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + }, + ) + + self.assertEqual(response.status_code, 200) + response_data = response.json() + response_data.pop("last_refresh") + self.assertEqual( + response_data, + { + "is_cached": False, + "result": { + "bins": [[2220.0, 2], [29080.0, 0], [55940.0, 0], [82800.0, 1]], + "average_conversion_time": 29540.0, + }, + }, + ) + + def test_funnel_time_to_convert_auto_bins_strict(self): + _create_person(distinct_ids=["user a"], team=self.team) + _create_person(distinct_ids=["user b"], team=self.team) + _create_person(distinct_ids=["user c"], team=self.team) + + _create_event(event="step one", distinct_id="user a", team=self.team, timestamp="2021-06-08 18:00:00") + _create_event(event="step two", distinct_id="user a", team=self.team, timestamp="2021-06-08 19:00:00") + # Converted from 0 
to 1 in 3600 s + _create_event(event="step three", distinct_id="user a", team=self.team, timestamp="2021-06-08 21:00:00") + + _create_event(event="step one", distinct_id="user b", team=self.team, timestamp="2021-06-09 13:00:00") + _create_event(event="step two", distinct_id="user b", team=self.team, timestamp="2021-06-09 13:37:00") + # Converted from 0 to 1 in 2200 s + + _create_event(event="step one", distinct_id="user c", team=self.team, timestamp="2021-06-11 07:00:00") + _create_event(event="step two", distinct_id="user c", team=self.team, timestamp="2021-06-12 06:00:00") + # Converted from 0 to 1 in 82_800 s + + response = self.client.post( + f"/api/projects/{self.team.id}/insights/funnel/", + { + "insight": "funnels", + "funnel_viz_type": "time_to_convert", + "funnel_order_type": "strict", + "interval": "day", + "date_from": "2021-06-07 00:00:00", + "date_to": "2021-06-13 23:59:59", + "funnel_to_step": 1, + "funnel_window_days": 7, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + }, + ) + + self.assertEqual(response.status_code, 200) + response_data = response.json() + response_data.pop("last_refresh") + self.assertEqual( + response_data, + { + "is_cached": False, + "result": { + "bins": [[2220.0, 2], [29080.0, 0], [55940.0, 0], [82800.0, 1]], + "average_conversion_time": 29540.0, + }, + }, + ) + + def test_funnel_time_to_convert_auto_bins_unordered(self): + _create_person(distinct_ids=["user a"], team=self.team) + _create_person(distinct_ids=["user b"], team=self.team) + _create_person(distinct_ids=["user c"], team=self.team) + + _create_event(event="step one", distinct_id="user a", team=self.team, timestamp="2021-06-08 18:00:00") + _create_event(event="step two", distinct_id="user a", team=self.team, timestamp="2021-06-08 19:00:00") + # Converted from 0 to 1 in 3600 s + _create_event(event="step three", distinct_id="user a", team=self.team, timestamp="2021-06-08 21:00:00") + + _create_event(event="step two", distinct_id="user b", team=self.team, timestamp="2021-06-09 13:00:00") + _create_event(event="step one", distinct_id="user b", team=self.team, timestamp="2021-06-09 13:37:00") + # Converted from 0 to 1 in 2200 s + + _create_event(event="step one", distinct_id="user c", team=self.team, timestamp="2021-06-11 07:00:00") + _create_event(event="step two", distinct_id="user c", team=self.team, timestamp="2021-06-12 06:00:00") + # Converted from 0 to 1 in 82_800 s + + response = self.client.post( + f"/api/projects/{self.team.id}/insights/funnel/", + { + "insight": "funnels", + "funnel_viz_type": "time_to_convert", + "funnel_order_type": "unordered", + "interval": "day", + "date_from": "2021-06-07 00:00:00", + "date_to": "2021-06-13 23:59:59", + "funnel_to_step": 1, + "funnel_window_days": 7, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + }, + ) + + self.assertEqual(response.status_code, 200) + response_data = response.json() + response_data.pop("last_refresh") + self.assertEqual( + response_data, + { + "is_cached": False, + "result": { + "bins": [[2220.0, 2], [29080.0, 0], [55940.0, 0], [82800.0, 1]], + "average_conversion_time": 29540.0, + }, + }, + ) + + def test_funnel_invalid_action_handled(self): + response = self.client.post( + f"/api/projects/{self.team.id}/insights/funnel/", + {"actions": [{"id": 666, "type": "actions", "order": 0},]}, + ) + + self.assertEqual(response.status_code, 400) + self.assertEqual(response.json(), 
self.validation_error_response("Action ID 666 does not exist!")) + + def test_funnel_basic_exclusions(self): + _create_person(distinct_ids=["1"], team=self.team) + _create_event(team=self.team, event="step one", distinct_id="1") + _create_event(team=self.team, event="step x", distinct_id="1") + _create_event(team=self.team, event="step two", distinct_id="1") + + _create_person(distinct_ids=["2"], team=self.team) + _create_event(team=self.team, event="step one", distinct_id="2") + _create_event(team=self.team, event="step two", distinct_id="2") + + response = self.client.post( + f"/api/projects/{self.team.id}/insights/funnel/", + { + "events": [ + {"id": "step one", "type": "events", "order": 0}, + {"id": "step two", "type": "events", "order": 1}, + ], + "exclusions": [{"id": "step x", "type": "events", "funnel_from_step": 0, "funnel_to_step": 1},], + "funnel_window_days": 14, + "insight": "funnels", + }, + ).json() + + self.assertEqual(len(response["result"]), 2) + self.assertEqual(response["result"][0]["name"], "step one") + self.assertEqual(response["result"][1]["name"], "step two") + self.assertEqual(response["result"][0]["count"], 1) + self.assertEqual(response["result"][1]["count"], 1) + + def test_funnel_invalid_exclusions(self): + _create_person(distinct_ids=["1"], team=self.team) + _create_event(team=self.team, event="step one", distinct_id="1") + _create_event(team=self.team, event="step x", distinct_id="1") + _create_event(team=self.team, event="step two", distinct_id="1") + + _create_person(distinct_ids=["2"], team=self.team) + _create_event(team=self.team, event="step one", distinct_id="2") + _create_event(team=self.team, event="step two", distinct_id="2") + + for exclusion_id, exclusion_from_step, exclusion_to_step, error in [ + ("step one", 0, 1, True), + ("step two", 0, 1, True), + ("step two", 0, 2, True), + ("step one", 0, 2, True), + ("step three", 0, 2, True), + ("step three", 0, 1, False), + ]: + response = self.client.post( + f"/api/projects/{self.team.id}/insights/funnel/", + { + "events": [ + {"id": "step one", "type": "events", "order": 0}, + {"id": "step two", "type": "events", "order": 1}, + {"id": "step three", "type": "events", "order": 2}, + ], + "exclusions": [ + { + "id": exclusion_id, + "type": "events", + "funnel_from_step": exclusion_from_step, + "funnel_to_step": exclusion_to_step, + }, + ], + "funnel_window_days": 14, + "insight": "funnels", + }, + ) + + if error: + self.assertEqual(response.status_code, 400) + self.assertEqual( + response.json(), self.validation_error_response("Exclusion event can't be the same as funnel step") + ) + else: + self.assertEqual(response.status_code, 200) + + @patch("ee.clickhouse.views.insights.ClickhouseInsightsViewSet.calculate_funnel") + def test_that_multi_property_breakdown_is_not_breaking(self, mcf): + + test_cases: List[Dict[str, Any]] = [ + # single property + {"breakdown": "$browser", "funnel result": ["Chrome", "Safari"], "expected": ["Chrome", "Safari"]}, + # single property client, multi property query result + {"breakdown": "$browser", "funnel result": [["Chrome"], ["Safari"]], "expected": ["Chrome", "Safari"]}, + # multi property client, multi property query result + { + "breakdown": ["$browser"], + "funnel result": [["Chrome"], ["Safari"]], + "expected": [["Chrome"], ["Safari"]], + }, + ] + + for test_case in test_cases: + + filter_with_breakdown = { + "insight": "FUNNELS", + "date_from": "-14d", + "actions": [], + "events": [ + {"id": "$pageview", "name": "$pageview", "type": "events", "order": 0}, + {"id": 
"$pageview", "type": "events", "order": 1, "name": "$pageview"}, + ], + "display": "FunnelViz", + "interval": "day", + "properties": [], + "funnel_viz_type": "steps", + "exclusions": [], + "breakdown": test_case["breakdown"], + "breakdown_type": "event", + "funnel_from_step": 0, + "funnel_to_step": 1, + } + + mcf.return_value = {"result": [[self.as_result(b), self.as_result(b)] for b in test_case["funnel result"]]} + + response = self.client.post(f"/api/projects/{self.team.id}/insights/funnel", filter_with_breakdown) + self.assertEqual(200, response.status_code) + + response_data = response.json() + + result = response_data["result"] + + # input events have chrome and safari so results is an array with two arrays as its contents + for i in range(0, 2): + for funnel_data in result[i]: + self.assertIsInstance(funnel_data["name"], str) + self.assertEqual(test_case["expected"][i], funnel_data["breakdown"]) + self.assertEqual(test_case["expected"][i], funnel_data["breakdown_value"]) + + @staticmethod + def as_result(breakdown_properties: Union[str, List[str]]) -> Dict[str, Any]: + return { + "action_id": "$pageview", + "name": "$pageview", + "custom_name": None, + "order": 0, + "people": ["a uuid"], + "count": 1, + "type": "events", + "average_conversion_time": None, + "median_conversion_time": None, + "breakdown": breakdown_properties, + "breakdown_value": breakdown_properties, + } diff --git a/ee/clickhouse/views/test/test_clickhouse_path_person.py b/ee/clickhouse/views/test/test_clickhouse_path_person.py new file mode 100644 index 0000000000000..24076afe57a38 --- /dev/null +++ b/ee/clickhouse/views/test/test_clickhouse_path_person.py @@ -0,0 +1,218 @@ +import json +from unittest.mock import patch +from uuid import uuid4 + +from django.core.cache import cache +from rest_framework import status + +from ee.clickhouse.models.event import create_event +from ee.clickhouse.util import ClickhouseTestMixin +from posthog.constants import FUNNEL_PATH_AFTER_STEP, INSIGHT_FUNNELS, INSIGHT_PATHS +from posthog.models.person import Person +from posthog.test.base import APIBaseTest + + +def _create_person(**kwargs): + person = Person.objects.create(**kwargs) + return person + + +def _create_event(**kwargs): + kwargs.update({"event_uuid": uuid4()}) + create_event(**kwargs) + + +class TestPathPerson(ClickhouseTestMixin, APIBaseTest): + def _create_sample_data(self, num, delete=False): + for i in range(num): + person = _create_person(distinct_ids=[f"user_{i}"], team=self.team) + _create_event( + event="step one", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:00:00", + properties={"$browser": "Chrome"}, + ) + if i % 2 == 0: + _create_event( + event="step two", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:10:00", + properties={"$browser": "Chrome"}, + ) + _create_event( + event="step three", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:20:00", + properties={"$browser": "Chrome"}, + ) + if delete: + person.delete() + + def test_basic_format(self): + self._create_sample_data(5) + request_data = { + "insight": INSIGHT_PATHS, + "filter_test_accounts": "false", + "date_from": "2021-05-01", + "date_to": "2021-05-10", + } + + response = self.client.get("/api/person/path/", data=request_data) + self.assertEqual(response.status_code, status.HTTP_200_OK) + j = response.json() + first_person = j["results"][0]["people"][0] + self.assertEqual(5, len(j["results"][0]["people"])) + self.assertTrue("id" in first_person and "name" in first_person and 
"distinct_ids" in first_person) + self.assertEqual(5, j["results"][0]["count"]) + + def test_basic_format_with_path_start_key_constraints(self): + self._create_sample_data(5) + request_data = { + "insight": INSIGHT_PATHS, + "filter_test_accounts": "false", + "date_from": "2021-05-01", + "date_to": "2021-05-10", + "path_start_key": "2_step two", + } + + response = self.client.get("/api/person/path/", data=request_data) + self.assertEqual(response.status_code, status.HTTP_200_OK) + j = response.json() + first_person = j["results"][0]["people"][0] + self.assertEqual(3, len(j["results"][0]["people"])) + self.assertTrue("id" in first_person and "name" in first_person and "distinct_ids" in first_person) + self.assertEqual(3, j["results"][0]["count"]) + + def test_basic_format_with_start_point_constraints(self): + self._create_sample_data(7) + request_data = { + "insight": INSIGHT_PATHS, + "filter_test_accounts": "false", + "date_from": "2021-05-01", + "date_to": "2021-05-10", + "path_start_key": "1_step two", + "start_point": "step two", + } + + response = self.client.get("/api/person/path/", data=request_data) + self.assertEqual(response.status_code, status.HTTP_200_OK) + j = response.json() + first_person = j["results"][0]["people"][0] + self.assertEqual(4, len(j["results"][0]["people"])) + self.assertTrue("id" in first_person and "name" in first_person and "distinct_ids" in first_person) + self.assertEqual(4, j["results"][0]["count"]) + + def test_basic_pagination(self): + self._create_sample_data(20) + request_data = { + "insight": INSIGHT_PATHS, + "filter_test_accounts": "false", + "date_from": "2021-05-01", + "date_to": "2021-05-10", + "limit": 15, + } + + response = self.client.get("/api/person/path/", data=request_data) + self.assertEqual(response.status_code, status.HTTP_200_OK) + j = response.json() + people = j["results"][0]["people"] + next = j["next"] + + self.assertEqual(15, len(people)) + self.assertNotEqual(None, next) + + response = self.client.get(next) + self.assertEqual(response.status_code, status.HTTP_200_OK) + j = response.json() + people = j["results"][0]["people"] + next = j["next"] + self.assertEqual(5, len(people)) + self.assertEqual(None, j["next"]) + + @patch("ee.clickhouse.models.person.delete_person") + def test_basic_pagination_with_deleted(self, delete_person_patch): + cache.clear() + self._create_sample_data(110, delete=True) + request_data = { + "insight": INSIGHT_PATHS, + "filter_test_accounts": "false", + "date_from": "2021-05-01", + "date_to": "2021-05-10", + } + + response = self.client.get("/api/person/path/", data=request_data) + self.assertEqual(response.status_code, status.HTTP_200_OK) + j = response.json() + people = j["results"][0]["people"] + next = j["next"] + self.assertEqual(0, len(people)) + self.assertIsNone(next) + + def test_basic_format_with_funnel_path_post(self): + self._create_sample_data(7) + request_data = { + "insight": INSIGHT_PATHS, + "funnel_paths": FUNNEL_PATH_AFTER_STEP, + "filter_test_accounts": "false", + "date_from": "2021-05-01", + "date_to": "2021-05-07", + "path_start_key": "1_step two", + "path_end_key": "2_step three", + } + + funnel_filter = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_interval": 7, + "funnel_window_interval_unit": "day", + "funnel_step": 2, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + + post_response = 
self.client.post("/api/person/path/", data={**request_data, "funnel_filter": funnel_filter}) + self.assertEqual(post_response.status_code, status.HTTP_200_OK) + post_j = post_response.json() + self.assertEqual(4, len(post_j["results"][0]["people"])) + + def test_basic_format_with_funnel_path_get(self): + self._create_sample_data(7) + request_data = { + "insight": INSIGHT_PATHS, + "funnel_paths": FUNNEL_PATH_AFTER_STEP, + "filter_test_accounts": "false", + "date_from": "2021-05-01", + "date_to": "2021-05-07", + "path_start_key": "1_step two", + "path_end_key": "2_step three", + } + + funnel_filter = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_interval": 7, + "funnel_window_interval_unit": "day", + "funnel_step": 2, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + + get_response = self.client.get( + "/api/person/path/", data={**request_data, "funnel_filter": json.dumps(funnel_filter)} + ) + self.assertEqual(get_response.status_code, status.HTTP_200_OK) + get_j = get_response.json() + self.assertEqual(4, len(get_j["results"][0]["people"])) diff --git a/ee/clickhouse/views/test/test_clickhouse_paths.py b/ee/clickhouse/views/test/test_clickhouse_paths.py new file mode 100644 index 0000000000000..46eeddc15a642 --- /dev/null +++ b/ee/clickhouse/views/test/test_clickhouse_paths.py @@ -0,0 +1,238 @@ +import json +from uuid import uuid4 + +from django.core.cache import cache +from rest_framework import status + +from ee.clickhouse.models.event import create_event +from ee.clickhouse.util import ClickhouseTestMixin +from posthog.constants import FUNNEL_PATH_AFTER_STEP, INSIGHT_FUNNELS, INSIGHT_PATHS +from posthog.models.person import Person +from posthog.test.base import APIBaseTest + + +def _create_person(**kwargs): + person = Person.objects.create(**kwargs) + return Person(id=person.uuid, uuid=person.uuid) + + +def _create_event(**kwargs): + kwargs.update({"event_uuid": uuid4()}) + create_event(**kwargs) + + +class TestClickhousePaths(ClickhouseTestMixin, APIBaseTest): + def _create_sample_data(self, num, delete=False): + for i in range(num): + person = _create_person(distinct_ids=[f"user_{i}"], team=self.team) + _create_event( + event="step one", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:00:00", + properties={"$browser": "Chrome"}, + ) + if i % 2 == 0: + _create_event( + event="step two", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:10:00", + properties={"$browser": "Chrome"}, + ) + _create_event( + event="step three", + distinct_id=f"user_{i}", + team=self.team, + timestamp="2021-05-01 00:20:00", + properties={"$browser": "Chrome"}, + ) + if delete: + person.delete() + + def test_insight_paths_basic(self): + _create_person(team=self.team, distinct_ids=["person_1"]) + _create_event( + properties={"$current_url": "/"}, distinct_id="person_1", event="$pageview", team=self.team, + ) + _create_event( + properties={"$current_url": "/about"}, distinct_id="person_1", event="$pageview", team=self.team, + ) + + response = self.client.get(f"/api/projects/{self.team.id}/insights/path",).json() + self.assertEqual(len(response["result"]), 1) + + def test_insight_paths_basic_exclusions(self): + _create_person(team=self.team, distinct_ids=["person_1"]) + _create_event( + distinct_id="person_1", event="first event", team=self.team, + ) + _create_event( + distinct_id="person_1", 
event="second event", team=self.team, + ) + _create_event( + distinct_id="person_1", event="third event", team=self.team, + ) + + response = self.client.get( + f"/api/projects/{self.team.id}/insights/path", data={"exclude_events": '["second event"]'} + ).json() + self.assertEqual(len(response["result"]), 1) + + def test_backwards_compatible_path_types(self): + + _create_person(team=self.team, distinct_ids=["person_1"]) + _create_event( + properties={"$current_url": "/"}, distinct_id="person_1", event="$pageview", team=self.team, + ) + _create_event( + properties={"$current_url": "/about"}, distinct_id="person_1", event="$pageview", team=self.team, + ) + _create_event( + properties={"$current_url": "/something else"}, distinct_id="person_1", event="$pageview", team=self.team, + ) + _create_event( + properties={"$screen_name": "/screen1"}, distinct_id="person_1", event="$screen", team=self.team, + ) + _create_event( + distinct_id="person_1", event="custom1", team=self.team, + ) + _create_event( + distinct_id="person_1", event="custom2", team=self.team, + ) + response = self.client.get( + f"/api/projects/{self.team.id}/insights/path", data={"path_type": "$pageview", "insight": "PATHS",} + ).json() + self.assertEqual(len(response["result"]), 2) + + response = self.client.get( + f"/api/projects/{self.team.id}/insights/path", data={"path_type": "custom_event", "insight": "PATHS"} + ).json() + self.assertEqual(len(response["result"]), 1) + response = self.client.get( + f"/api/projects/{self.team.id}/insights/path", data={"path_type": "$screen", "insight": "PATHS"} + ).json() + self.assertEqual(len(response["result"]), 0) + + def test_backwards_compatible_start_point(self): + + _create_person(team=self.team, distinct_ids=["person_1"]) + _create_event( + properties={"$current_url": "/"}, distinct_id="person_1", event="$pageview", team=self.team, + ) + _create_event( + properties={"$current_url": "/about"}, distinct_id="person_1", event="$pageview", team=self.team, + ) + _create_event( + properties={"$current_url": "/something else"}, distinct_id="person_1", event="$pageview", team=self.team, + ) + _create_event( + properties={"$screen_name": "/screen1"}, distinct_id="person_1", event="$screen", team=self.team, + ) + _create_event( + properties={"$screen_name": "/screen2"}, distinct_id="person_1", event="$screen", team=self.team, + ) + _create_event( + distinct_id="person_1", event="custom1", team=self.team, + ) + _create_event( + distinct_id="person_1", event="custom2", team=self.team, + ) + response = self.client.get( + f"/api/projects/{self.team.id}/insights/path", + data={"path_type": "$pageview", "insight": "PATHS", "start_point": "/about",}, + ).json() + self.assertEqual(len(response["result"]), 1) + + response = self.client.get( + f"/api/projects/{self.team.id}/insights/path", + data={"path_type": "custom_event", "insight": "PATHS", "start_point": "custom2",}, + ).json() + self.assertEqual(len(response["result"]), 0) + response = self.client.get( + f"/api/projects/{self.team.id}/insights/path", + data={"path_type": "$screen", "insight": "PATHS", "start_point": "/screen1",}, + ).json() + self.assertEqual(len(response["result"]), 1) + + def test_path_groupings(self): + _create_person(team=self.team, distinct_ids=["person_1"]) + _create_event( + properties={"$current_url": "/about_1"}, distinct_id="person_1", event="$pageview", team=self.team, + ) + _create_event( + properties={"$current_url": "/about_2"}, distinct_id="person_1", event="$pageview", team=self.team, + ) + _create_event( + 
properties={"$current_url": "/something else"}, distinct_id="person_1", event="$pageview", team=self.team, + ) + _create_event( + properties={"$current_url": "/about3"}, distinct_id="person_1", event="$pageview", team=self.team, + ) + _create_event( + properties={"$current_url": "/about4"}, distinct_id="person_1", event="$pageview", team=self.team, + ) + + _create_person(team=self.team, distinct_ids=["person_2"]) + _create_event( + properties={"$current_url": "/about_1"}, distinct_id="person_2", event="$pageview", team=self.team, + ) + _create_event( + properties={"$current_url": "/about_2"}, distinct_id="person_2", event="$pageview", team=self.team, + ) + _create_event( + properties={"$current_url": "/something else"}, distinct_id="person_2", event="$pageview", team=self.team, + ) + _create_event( + properties={"$current_url": "/about3"}, distinct_id="person_2", event="$pageview", team=self.team, + ) + _create_event( + properties={"$current_url": "/about4"}, distinct_id="person_2", event="$pageview", team=self.team, + ) + + response = self.client.get( + f"/api/projects/{self.team.id}/insights/path", + data={"insight": "PATHS", "path_groupings": json.dumps(["/about*"])}, + ).json() + self.assertEqual(len(response["result"]), 2) + + response = self.client.get( + f"/api/projects/{self.team.id}/insights/path", + data={"insight": "PATHS", "path_groupings": json.dumps(["/about_*"])}, + ).json() + self.assertEqual(len(response["result"]), 3) + + def test_funnel_path_post(self): + self._create_sample_data(7) + request_data = { + "insight": INSIGHT_PATHS, + "funnel_paths": FUNNEL_PATH_AFTER_STEP, + "filter_test_accounts": "false", + "date_from": "2021-05-01", + "date_to": "2021-05-07", + } + + funnel_filter = { + "insight": INSIGHT_FUNNELS, + "interval": "day", + "date_from": "2021-05-01 00:00:00", + "date_to": "2021-05-07 00:00:00", + "funnel_window_interval": 7, + "funnel_window_interval_unit": "day", + "funnel_step": 2, + "events": [ + {"id": "step one", "order": 0}, + {"id": "step two", "order": 1}, + {"id": "step three", "order": 2}, + ], + } + + post_response = self.client.post( + f"/api/projects/{self.team.id}/insights/path/", data={**request_data, "funnel_filter": funnel_filter} + ) + self.assertEqual(post_response.status_code, status.HTTP_200_OK) + post_j = post_response.json() + self.assertEqual( + post_j["result"], + [{"source": "1_step two", "target": "2_step three", "value": 4, "average_conversion_time": 600000.0}], + ) diff --git a/ee/clickhouse/views/test/test_clickhouse_person.py b/ee/clickhouse/views/test/test_clickhouse_person.py index dac376861d835..3e15ffa21af04 100644 --- a/ee/clickhouse/views/test/test_clickhouse_person.py +++ b/ee/clickhouse/views/test/test_clickhouse_person.py @@ -1,10 +1,12 @@ from uuid import uuid4 +from rest_framework import status + from ee.clickhouse.client import sync_execute from ee.clickhouse.models.event import create_event from ee.clickhouse.util import ClickhouseTestMixin -from posthog.api.test.test_person import test_person_factory -from posthog.models import Action, ActionStep, Event, Person +from posthog.api.test.test_person import factory_test_person +from posthog.models import Event, Person def _create_event(**kwargs): @@ -12,12 +14,8 @@ def _create_event(**kwargs): return Event(pk=create_event(**kwargs)) -def _get_events(): - return sync_execute("select * from events") - - -def _get_people(): - return [Person(p) for p in sync_execute("select * from person")] +def _get_events(team_id): + return sync_execute("SELECT * FROM events WHERE 
team_id = %(team_id)s", {"team_id": team_id}) def _create_person(**kwargs): @@ -25,6 +23,6 @@ def _create_person(**kwargs): class ClickhouseTestPersonApi( - ClickhouseTestMixin, test_person_factory(_create_event, _create_person, _get_events, Person.objects.all) # type: ignore + ClickhouseTestMixin, factory_test_person(_create_event, _create_person, _get_events) # type: ignore ): pass diff --git a/ee/clickhouse/views/test/test_clickhouse_session_recordings.py b/ee/clickhouse/views/test/test_clickhouse_session_recordings.py new file mode 100644 index 0000000000000..0e77dfdee4571 --- /dev/null +++ b/ee/clickhouse/views/test/test_clickhouse_session_recordings.py @@ -0,0 +1,15 @@ +from uuid import uuid4 + +from ee.clickhouse.models.session_recording_event import create_session_recording_event +from ee.clickhouse.util import ClickhouseTestMixin +from posthog.api.test.test_session_recordings import factory_test_session_recordings_api + + +def _create_session_recording_event(**kwargs): + create_session_recording_event( + uuid=uuid4(), **kwargs, + ) + + +class ClickhouseTestSessionRecordingsAPI(ClickhouseTestMixin, factory_test_session_recordings_api(_create_session_recording_event)): # type: ignore + pass diff --git a/ee/conftest.py b/ee/conftest.py new file mode 100644 index 0000000000000..e3d6611abce0e --- /dev/null +++ b/ee/conftest.py @@ -0,0 +1,143 @@ +import pytest +from infi.clickhouse_orm import Database + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.sql.dead_letter_queue import ( + DEAD_LETTER_QUEUE_TABLE_MV_SQL, + KAFKA_DEAD_LETTER_QUEUE_TABLE_SQL, + TRUNCATE_DEAD_LETTER_QUEUE_TABLE_MV_SQL, +) +from posthog.settings import ( + CLICKHOUSE_DATABASE, + CLICKHOUSE_HTTP_URL, + CLICKHOUSE_PASSWORD, + CLICKHOUSE_USER, + CLICKHOUSE_VERIFY, +) +from posthog.test.base import TestMixin +from posthog.utils import is_clickhouse_enabled + + +def create_clickhouse_tables(num_tables: int): + # Reset clickhouse tables to default before running test + # Mostly so that test runs locally work correctly + from ee.clickhouse.sql.cohort import CREATE_COHORTPEOPLE_TABLE_SQL + from ee.clickhouse.sql.dead_letter_queue import DEAD_LETTER_QUEUE_TABLE_SQL + from ee.clickhouse.sql.events import EVENTS_TABLE_SQL + from ee.clickhouse.sql.groups import GROUPS_TABLE_SQL + from ee.clickhouse.sql.person import ( + PERSON_STATIC_COHORT_TABLE_SQL, + PERSONS_DISTINCT_ID_TABLE_SQL, + PERSONS_TABLE_SQL, + ) + from ee.clickhouse.sql.plugin_log_entries import PLUGIN_LOG_ENTRIES_TABLE_SQL + from ee.clickhouse.sql.session_recording_events import SESSION_RECORDING_EVENTS_TABLE_SQL + + # REMEMBER TO ADD ANY NEW CLICKHOUSE TABLES TO THIS ARRAY! 
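# create_clickhouse_tables() is a no-op when the number of tables already present in ClickHouse
# matches the length of this list (see the num_tables check below), so a table left out of this
# array is never created for reused/kept test databases.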
+ TABLES_TO_CREATE_DROP = [ + EVENTS_TABLE_SQL, + PERSONS_TABLE_SQL, + PERSONS_DISTINCT_ID_TABLE_SQL, + PERSON_STATIC_COHORT_TABLE_SQL, + SESSION_RECORDING_EVENTS_TABLE_SQL, + PLUGIN_LOG_ENTRIES_TABLE_SQL, + CREATE_COHORTPEOPLE_TABLE_SQL, + KAFKA_DEAD_LETTER_QUEUE_TABLE_SQL, + DEAD_LETTER_QUEUE_TABLE_SQL, + DEAD_LETTER_QUEUE_TABLE_MV_SQL, + GROUPS_TABLE_SQL, + ] + + if num_tables == len(TABLES_TO_CREATE_DROP): + return + + for item in TABLES_TO_CREATE_DROP: + sync_execute(item) + + +def reset_clickhouse_tables(): + # Reset clickhouse tables to default before running test + # Mostly so that test runs locally work correctly + from ee.clickhouse.sql.cohort import TRUNCATE_COHORTPEOPLE_TABLE_SQL + from ee.clickhouse.sql.dead_letter_queue import TRUNCATE_DEAD_LETTER_QUEUE_TABLE_SQL + from ee.clickhouse.sql.events import TRUNCATE_EVENTS_TABLE_SQL + from ee.clickhouse.sql.groups import TRUNCATE_GROUPS_TABLE_SQL + from ee.clickhouse.sql.person import ( + TRUNCATE_PERSON_DISTINCT_ID_TABLE_SQL, + TRUNCATE_PERSON_STATIC_COHORT_TABLE_SQL, + TRUNCATE_PERSON_TABLE_SQL, + ) + from ee.clickhouse.sql.plugin_log_entries import TRUNCATE_PLUGIN_LOG_ENTRIES_TABLE_SQL + from ee.clickhouse.sql.session_recording_events import TRUNCATE_SESSION_RECORDING_EVENTS_TABLE_SQL + + # REMEMBER TO ADD ANY NEW CLICKHOUSE TABLES TO THIS ARRAY! + TABLES_TO_CREATE_DROP = [ + TRUNCATE_EVENTS_TABLE_SQL, + TRUNCATE_PERSON_TABLE_SQL, + TRUNCATE_PERSON_DISTINCT_ID_TABLE_SQL, + TRUNCATE_PERSON_STATIC_COHORT_TABLE_SQL, + TRUNCATE_SESSION_RECORDING_EVENTS_TABLE_SQL, + TRUNCATE_PLUGIN_LOG_ENTRIES_TABLE_SQL, + TRUNCATE_COHORTPEOPLE_TABLE_SQL, + TRUNCATE_DEAD_LETTER_QUEUE_TABLE_SQL, + TRUNCATE_DEAD_LETTER_QUEUE_TABLE_MV_SQL, + TRUNCATE_GROUPS_TABLE_SQL, + ] + + for item in TABLES_TO_CREATE_DROP: + sync_execute(item) + + +if is_clickhouse_enabled(): + + @pytest.fixture(scope="package") + def django_db_setup(django_db_setup, django_db_keepdb): + database = Database( + CLICKHOUSE_DATABASE, + db_url=CLICKHOUSE_HTTP_URL, + username=CLICKHOUSE_USER, + password=CLICKHOUSE_PASSWORD, + verify_ssl_cert=CLICKHOUSE_VERIFY, + ) + + if not django_db_keepdb: + try: + database.drop_database() + except: + pass + + database.create_database() # Create database if it doesn't exist + table_count = sync_execute( + "SELECT count() FROM system.tables WHERE database = %(database)s", {"database": CLICKHOUSE_DATABASE} + )[0][0] + create_clickhouse_tables(table_count) + + yield + + if django_db_keepdb: + reset_clickhouse_tables() + else: + try: + database.drop_database() + except: + pass + + +@pytest.fixture +def base_test_mixin_fixture(): + kls = TestMixin() + kls.setUp() + kls.setUpTestData() + + return kls + + +@pytest.fixture +def team(base_test_mixin_fixture): + return base_test_mixin_fixture.team + + +# :TRICKY: Integrate syrupy with unittest test cases +@pytest.fixture +def unittest_snapshot(request, snapshot): + request.cls.snapshot = snapshot diff --git a/ee/docker-compose.ch.arm64.yml b/ee/docker-compose.ch.arm64.yml new file mode 100644 index 0000000000000..6c719d04a4cf6 --- /dev/null +++ b/ee/docker-compose.ch.arm64.yml @@ -0,0 +1,104 @@ +version: '3' + +services: + db: + image: postgres:12-alpine + environment: + POSTGRES_USER: posthog + POSTGRES_DB: posthog + POSTGRES_PASSWORD: posthog + ports: + - '5432:5432' + redis: + image: 'redis:alpine' + ports: + - '6379:6379' + clickhouse: + # Build with: yarn arm64:build:clickhouse + image: clickhouse-dev-arm64:latest + depends_on: + - kafka + - zookeeper + ports: + - '8123:8123' + - '9000:9000' + - 
'9440:9440' + - '9009:9009' + volumes: + - ./idl:/idl + - ../docker/clickhouse/docker-entrypoint-initdb.d:/docker-entrypoint-initdb.d + - ../docker/clickhouse/config.xml:/etc/clickhouse-server/config.xml + - ../docker/clickhouse/users.xml:/etc/clickhouse-server/users.xml + zookeeper: + image: zookeeper + restart: always + kafka: + image: wurstmeister/kafka + depends_on: + - zookeeper + ports: + - '9092:9092' + environment: + KAFKA_ADVERTISED_HOST_NAME: kafka + KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 + worker: &worker + build: + context: ../ + dockerfile: dev.Dockerfile + command: ./bin/docker-worker-celery --with-scheduler + volumes: + - ..:/code + environment: + DATABASE_URL: 'postgres://posthog:posthog@db:5432/posthog' + CLICKHOUSE_HOST: 'clickhouse' + CLICKHOUSE_DATABASE: 'posthog' + CLICKHOUSE_SECURE: 'false' + CLICKHOUSE_VERIFY: 'false' + KAFKA_URL: 'kafka://kafka' + REDIS_URL: 'redis://redis:6379/' + SECRET_KEY: 'alsdfjiosdajfklalsdjkf' + DEBUG: 'true' + PRIMARY_DB: 'clickhouse' + PGHOST: db + PGUSER: posthog + PGPASSWORD: posthog + depends_on: + - db + - redis + - clickhouse + - kafka + links: + - db:db + - redis:redis + - clickhouse:clickhouse + - kafka:kafka + web: + <<: *worker + command: '${CH_WEB_SCRIPT:-./ee/bin/docker-ch-dev-web}' + ports: + - '8000:8000' + - '8234:8234' + plugins: + build: + context: ../ + dockerfile: dev.Dockerfile + command: ./bin/plugin-server --no-restart-loop + volumes: + - ..:/code + restart: on-failure + environment: + DATABASE_URL: 'postgres://posthog:posthog@db:5432/posthog' + KAFKA_ENABLED: 'true' + KAFKA_HOSTS: 'kafka:9092' + REDIS_URL: 'redis://redis:6379/' + CLICKHOUSE_HOST: 'clickhouse' + depends_on: + - db + - redis + - clickhouse + - kafka + links: + - db:db + - redis:redis + - clickhouse:clickhouse + - kafka:kafka diff --git a/ee/docker-compose.ch.test.yml b/ee/docker-compose.ch.test.yml new file mode 100644 index 0000000000000..c10b601113e4d --- /dev/null +++ b/ee/docker-compose.ch.test.yml @@ -0,0 +1,31 @@ +version: '3' + +services: + test: + build: + context: ../ + dockerfile: dev.Dockerfile + command: ./ee/bin/docker-ch-test + volumes: + - ..:/code + environment: + DATABASE_URL: 'postgres://posthog:posthog@db:5432/posthog' + CLICKHOUSE_HOST: 'clickhouse' + CLICKHOUSE_SECURE: 'false' + CLICKHOUSE_VERIFY: 'false' + KAFKA_URL: 'kafka://kafka' + REDIS_URL: 'redis://redis:6379/' + SECRET_KEY: 'alsdfjiosdajfklalsdjkf' + DEBUG: 'true' + PRIMARY_DB: 'clickhouse' + TEST: 'true' + depends_on: + - db + - redis + - clickhouse + - kafka + links: + - db:db + - redis:redis + - clickhouse:clickhouse + - kafka:kafka diff --git a/ee/docker-compose.ch.yml b/ee/docker-compose.ch.yml index 32d58f9cea635..552549e3cdfe1 100644 --- a/ee/docker-compose.ch.yml +++ b/ee/docker-compose.ch.yml @@ -2,86 +2,103 @@ version: '3' services: db: - image: postgres:11-alpine + image: postgres:12-alpine environment: POSTGRES_USER: posthog POSTGRES_DB: posthog POSTGRES_PASSWORD: posthog ports: - - '5439:5432' + - '5432:5432' redis: image: 'redis:alpine' ports: - '6379:6379' - web: + clickhouse: + # KEEP CLICKHOUSE-SERVER VERSION IN SYNC WITH + # https://github.com/PostHog/charts-clickhouse/blob/main/charts/posthog/templates/clickhouse_instance.yaml#L88 + image: yandex/clickhouse-server:21.6.5 + depends_on: + - kafka + - zookeeper + ports: + - '8123:8123' + - '9000:9000' + - '9440:9440' + - '9009:9009' + volumes: + - ./idl:/idl + - ../docker/clickhouse/docker-entrypoint-initdb.d:/docker-entrypoint-initdb.d + - ../docker/clickhouse/config.xml:/etc/clickhouse-server/config.xml + 
- ../docker/clickhouse/users.xml:/etc/clickhouse-server/users.xml + zookeeper: + image: wurstmeister/zookeeper + kafka: + image: wurstmeister/kafka + depends_on: + - zookeeper + ports: + - '9092:9092' + environment: + KAFKA_ADVERTISED_HOST_NAME: kafka + KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 + worker: &worker build: context: ../ dockerfile: dev.Dockerfile - command: ./bin/docker-ch-dev-web + command: ./bin/docker-worker-celery --with-scheduler volumes: - ..:/code - ports: - - '8000:8000' - - '8234:8234' environment: - IS_DOCKER: 'true' DATABASE_URL: 'postgres://posthog:posthog@db:5432/posthog' CLICKHOUSE_HOST: 'clickhouse' CLICKHOUSE_DATABASE: 'posthog' - CLICKHOUSE_SECURE: 'False' - CLICKHOUSE_VERIFY: 'False' + CLICKHOUSE_SECURE: 'false' + CLICKHOUSE_VERIFY: 'false' KAFKA_URL: 'kafka://kafka' REDIS_URL: 'redis://redis:6379/' SECRET_KEY: 'alsdfjiosdajfklalsdjkf' DEBUG: 'true' PRIMARY_DB: 'clickhouse' + PGHOST: db + PGUSER: posthog + PGPASSWORD: posthog depends_on: - db - redis + - clickhouse + - kafka links: - db:db - redis:redis - worker: + - clickhouse:clickhouse + - kafka:kafka + web: + <<: *worker + command: '${CH_WEB_SCRIPT:-./ee/bin/docker-ch-dev-web}' + ports: + - '8000:8000' + - '8234:8234' + plugins: build: context: ../ dockerfile: dev.Dockerfile - command: ./bin/docker-worker + command: ./bin/plugin-server --no-restart-loop volumes: - ..:/code + restart: on-failure environment: - IS_DOCKER: 'true' DATABASE_URL: 'postgres://posthog:posthog@db:5432/posthog' - CLICKHOUSE_HOST: 'clickhouse' - CLICKHOUSE_DATABASE: 'posthog' - CLICKHOUSE_SECURE: 'False' - CLICKHOUSE_VERIFY: 'False' - KAFKA_URL: 'kafka://kafka' + KAFKA_ENABLED: 'true' + KAFKA_HOSTS: 'kafka:9092' REDIS_URL: 'redis://redis:6379/' - SECRET_KEY: 'asdflaisdjkfalsdkjf' - DEBUG: 'true' - PRIMARY_DB: 'clickhouse' + CLICKHOUSE_HOST: 'clickhouse' depends_on: - db - redis + - clickhouse + - kafka links: - db:db - redis:redis - clickhouse: - image: yandex/clickhouse-server - ports: - - '8123:8123' - - '9000:9000' - - '9440:9440' - - '9009:9009' - volumes: - - ./idl:/idl - - ../docker/clickhouse/docker-entrypoint-initdb.d:/docker-entrypoint-initdb.d - zookeeper: - image: wurstmeister/zookeeper - kafka: - image: wurstmeister/kafka - ports: - - '9092:9092' - environment: - KAFKA_ADVERTISED_HOST_NAME: kafka - KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 + - clickhouse:clickhouse + - kafka:kafka diff --git a/ee/idl/omni_person.proto b/ee/idl/omni_person.proto deleted file mode 100644 index d327e1ac5f1b6..0000000000000 --- a/ee/idl/omni_person.proto +++ /dev/null @@ -1,12 +0,0 @@ -syntax = "proto3"; -import "google/protobuf/timestamp.proto"; - -message Person { - string uuid = 1; - string event_uuid = 2; - uint64 team_id = 3; - string distinct_id = 4; - string properties = 5; - bool is_identified = 6; - google.protobuf.Timestamp ts = 7; -} diff --git a/ee/idl/person_static_cohort.proto b/ee/idl/person_static_cohort.proto new file mode 100644 index 0000000000000..7900b602553ae --- /dev/null +++ b/ee/idl/person_static_cohort.proto @@ -0,0 +1,8 @@ +syntax = "proto3"; + +message PersonStaticCohort { + string id = 1; + string person_id = 2; + uint64 cohort_id = 3; + uint64 team_id = 4; +} \ No newline at end of file diff --git a/ee/kafka_client/client.py b/ee/kafka_client/client.py index 1d1554c0d0d2a..f678c9e125fce 100644 --- a/ee/kafka_client/client.py +++ b/ee/kafka_client/client.py @@ -2,47 +2,75 @@ import json from typing import Any, Callable, Dict, Optional -import kafka_helper from google.protobuf.internal.encoder import _VarintBytes # 
type: ignore from google.protobuf.json_format import MessageToJson +from kafka import KafkaConsumer as KC from kafka import KafkaProducer as KP from ee.clickhouse.client import async_execute, sync_execute +from ee.kafka_client import helper from ee.settings import KAFKA_ENABLED -from posthog.settings import IS_HEROKU, KAFKA_HOSTS, TEST +from posthog.settings import KAFKA_BASE64_KEYS, KAFKA_HOSTS, TEST from posthog.utils import SingletonDecorator +KAFKA_PRODUCER_RETRIES = 5 + class TestKafkaProducer: def __init__(self): pass - def send(self, topic: str, data: Any): + def send(self, topic: str, value: Any, key: Any = None): return def flush(self): return +class TestKafkaConsumer: + def __init__(self, topic="test", max=0, **kwargs): + self.max = max + self.n = 0 + self.topic = topic + + def __iter__(self): + return self + + def __next__(self): + if self.n <= self.max: + self.n += 1 + return f"message {self.n} from {self.topic} topic" + else: + raise StopIteration + + def seek_to_beginning(self): + return + + def seek_to_end(self): + return + + class _KafkaProducer: - def __init__(self): - if TEST: + def __init__(self, test=TEST): + if test: self.producer = TestKafkaProducer() - elif not IS_HEROKU: - self.producer = KP(bootstrap_servers=KAFKA_HOSTS) + elif KAFKA_BASE64_KEYS: + self.producer = helper.get_kafka_producer(retries=KAFKA_PRODUCER_RETRIES, value_serializer=lambda d: d) else: - self.producer = kafka_helper.get_kafka_producer(value_serializer=lambda d: d) + self.producer = KP(retries=KAFKA_PRODUCER_RETRIES, bootstrap_servers=KAFKA_HOSTS) @staticmethod def json_serializer(d): b = json.dumps(d).encode("utf-8") return b - def produce(self, topic: str, data: Any, value_serializer: Optional[Callable[[Any], Any]] = None): + def produce(self, topic: str, data: Any, key: Any = None, value_serializer: Optional[Callable[[Any], Any]] = None): if not value_serializer: value_serializer = self.json_serializer b = value_serializer(data) - self.producer.send(topic, b) + if key is not None: + key = key.encode("utf-8") + self.producer.send(topic, value=b) def close(self): self.producer.flush() @@ -51,9 +79,28 @@ def close(self): KafkaProducer = SingletonDecorator(_KafkaProducer) +def build_kafka_consumer( + topic: str, value_deserializer=lambda v: json.loads(v.decode("utf-8")), auto_offset_reset="latest", test=TEST +): + if test: + consumer = TestKafkaConsumer(topic=topic, auto_offset_reset=auto_offset_reset, max=10) + elif KAFKA_BASE64_KEYS: + consumer = helper.get_kafka_consumer( + topic=topic, auto_offset_reset=auto_offset_reset, value_deserializer=value_deserializer + ) + else: + consumer = KC( + topic, + bootstrap_servers=KAFKA_HOSTS, + auto_offset_reset=auto_offset_reset, + value_deserializer=value_deserializer, + ) + return consumer + + class ClickhouseProducer: - def __init__(self): - if KAFKA_ENABLED: + def __init__(self, kafka_enabled=KAFKA_ENABLED): + if kafka_enabled: self.send_to_kafka = True self.producer = KafkaProducer() else: diff --git a/ee/kafka_client/helper.py b/ee/kafka_client/helper.py new file mode 100644 index 0000000000000..b495e7de30f27 --- /dev/null +++ b/ee/kafka_client/helper.py @@ -0,0 +1,134 @@ +""" +Helper methods for creating the kafka-python KafkaProducer and KafkaConsumer objects. 
+https://github.com/heroku/kafka-helper +""" + +import base64 +import json +import os +import ssl +from tempfile import NamedTemporaryFile + +try: + from urllib.parse import urlparse +except ImportError: + from urlparse import urlparse # type: ignore + +from base64 import standard_b64encode + +from cryptography.hazmat.backends import default_backend +from cryptography.hazmat.primitives import serialization +from kafka import KafkaConsumer, KafkaProducer + + +def get_kafka_ssl_context(): + """ + Returns an SSL context based on the certificate information in the Kafka config vars. + """ + # NOTE: We assume that Kafka environment variables are present. If using + # Apache Kafka on Heroku, they will be available in your app configuration. + # + # 1. Write the PEM certificates necessary for connecting to the Kafka brokers to physical + # files. The broker connection SSL certs are passed in environment/config variables and + # the python and ssl libraries require them in physical files. The public keys are written + # to short lived NamedTemporaryFile files; the client key is encrypted before writing to + # the short lived NamedTemporaryFile + # + # 2. Create and return an SSLContext for connecting to the Kafka brokers referencing the + # PEM certificates written above + # + + # stash the kafka certs in named temporary files for loading into SSLContext. Initialize the + # SSLContext inside the with so when it goes out of scope the files are removed which has them + # existing for the shortest amount of time. As extra caution password + # protect/encrypt the client key + with NamedTemporaryFile(suffix=".crt") as cert_file, NamedTemporaryFile( + suffix=".key" + ) as key_file, NamedTemporaryFile(suffix=".crt") as trust_file: + cert_file.write(base64.b64decode(os.environ["KAFKA_CLIENT_CERT_B64"].encode("utf-8"))) + cert_file.flush() + + # setup cryptography to password encrypt/protect the client key so it's not in the clear on + # the filesystem. Use the generated password in the call to load_cert_chain + passwd = standard_b64encode(os.urandom(33)) + private_key = serialization.load_pem_private_key( + base64.b64decode(os.environ["KAFKA_CLIENT_CERT_KEY_B64"].encode("utf-8")), + password=None, + backend=default_backend(), + ) + pem = private_key.private_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PrivateFormat.PKCS8, + encryption_algorithm=serialization.BestAvailableEncryption(passwd), + ) + key_file.write(pem) + key_file.flush() + + trust_file.write(base64.b64decode(os.environ["KAFKA_TRUSTED_CERT_B64"].encode("utf-8"))) + trust_file.flush() + + # create an SSLContext for passing into the kafka provider using the create_default_context + # function which creates an SSLContext with protocol set to PROTOCOL_SSLv23, OP_NO_SSLv2, + # and OP_NO_SSLv3 when purpose=SERVER_AUTH. + ssl_context = ssl.create_default_context(purpose=ssl.Purpose.SERVER_AUTH, cafile=trust_file.name) + ssl_context.load_cert_chain(cert_file.name, keyfile=key_file.name, password=passwd) + + # Intentionally disabling hostname checking. The Kafka cluster runs in the cloud and Apache + # Kafka on Heroku doesn't currently provide stable hostnames. We're pinned to a specific certificate + # for this connection even though the certificate doesn't include host information. We rely + # on the ca trust_cert for this purpose. 
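# With hostname checking off, transport security rests on the mutual-TLS material loaded above:
# the pinned CA bundle from KAFKA_TRUSTED_CERT_B64 plus the client certificate/key pair from
# KAFKA_CLIENT_CERT_B64 and KAFKA_CLIENT_CERT_KEY_B64, all supplied as base64-encoded PEM in the
# environment.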
+ ssl_context.check_hostname = False + + return ssl_context + + +def get_kafka_brokers(): + """ + Parses the KAKFA_URL and returns a list of hostname:port pairs in the format + that kafka-python expects. + """ + # NOTE: The Kafka environment variables need to be present. If using + # Apache Kafka on Heroku, they will be available in your app configuration. + if not os.environ.get("KAFKA_URL"): + raise RuntimeError("The KAFKA_URL config variable is not set.") + + return [ + "{}:{}".format(parsedUrl.hostname, parsedUrl.port) + for parsedUrl in [urlparse(url) for url in os.environ.get("KAFKA_URL", "").split(",")] + ] + + +def get_kafka_producer(acks="all", value_serializer=lambda v: json.dumps(v).encode("utf-8"), **kwargs): + """ + Return a KafkaProducer that uses the SSLContext created with create_ssl_context. + """ + + producer = KafkaProducer( + bootstrap_servers=get_kafka_brokers(), + security_protocol="SSL", + ssl_context=get_kafka_ssl_context(), + value_serializer=value_serializer, + acks=acks, + **kwargs + ) + + return producer + + +def get_kafka_consumer(topic=None, value_deserializer=lambda v: json.loads(v.decode("utf-8")), **kwargs): + """ + Return a KafkaConsumer that uses the SSLContext created with create_ssl_context. + """ + + # Create the KafkaConsumer connected to the specified brokers. Use the + # SSLContext that is created with create_ssl_context. + consumer = KafkaConsumer( + topic, + bootstrap_servers=get_kafka_brokers(), + security_protocol="SSL", + ssl_context=get_kafka_ssl_context(), + value_deserializer=value_deserializer, + **kwargs + ) + + return consumer diff --git a/ee/kafka_client/test/test_client.py b/ee/kafka_client/test/test_client.py new file mode 100644 index 0000000000000..2181c58f76d6e --- /dev/null +++ b/ee/kafka_client/test/test_client.py @@ -0,0 +1,30 @@ +from django.test import TestCase + +from ee.kafka_client.client import _KafkaProducer, build_kafka_consumer + + +class KafkaClientTestCase(TestCase): + def setUp(self): + self.topic = "test_topic" + self.payload = {"foo": "bar"} + + def test_kafka_interface(self): + producer = _KafkaProducer(test=True) + consumer = build_kafka_consumer(topic=self.topic, test=True) + + producer.produce(topic=self.topic, data="any") + producer.close() + msg = next(consumer) + self.assertEqual(msg, "message 1 from test_topic topic") + + def test_kafka_produce(self): + producer = _KafkaProducer(test=False) + producer.produce(topic=self.topic, data=self.payload) + producer.close() + + def test_kafka_produce_and_consume(self): + producer = _KafkaProducer(test=False) + consumer = build_kafka_consumer(topic=self.topic, auto_offset_reset="earliest", test=False) + producer.produce(topic=self.topic, data=self.payload) + payload = next(consumer) + self.assertEqual(payload.value, self.payload) diff --git a/ee/kafka_client/topics.py b/ee/kafka_client/topics.py index 738d97c4d21c7..5c1c065bf0925 100644 --- a/ee/kafka_client/topics.py +++ b/ee/kafka_client/topics.py @@ -1,6 +1,12 @@ -KAFKA_EVENTS = "clickhouse_events_proto" -KAFKA_PERSON = "clickhouse_person" -KAFKA_PERSON_UNIQUE_ID = "clickhouse_person_unique_id" -KAFKA_SESSION_RECORDING_EVENTS = "clickhouse_session_recording_events" +from posthog.settings import TEST -KAFKA_EVENTS_WAL = "events_write_ahead_log" +suffix = "_test" if TEST else "" + +KAFKA_EVENTS = f"clickhouse_events_proto{suffix}" +KAFKA_PERSON = f"clickhouse_person{suffix}" +KAFKA_PERSON_UNIQUE_ID = f"clickhouse_person_unique_id{suffix}" +KAFKA_SESSION_RECORDING_EVENTS = f"clickhouse_session_recording_events{suffix}" 
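The shared suffix computed at import time is what keeps test runs off the real topics: every topic name gains a "_test" suffix whenever posthog.settings.TEST is truthy, and the remaining topic names below follow the same pattern. A minimal sketch of that behaviour, with TEST stubbed in rather than imported:

# Illustrative sketch only; TEST stands in for posthog.settings.TEST under the test runner.
TEST = True
suffix = "_test" if TEST else ""
assert f"clickhouse_events_proto{suffix}" == "clickhouse_events_proto_test"
assert f"clickhouse_session_recording_events{suffix}" == "clickhouse_session_recording_events_test"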
+KAFKA_EVENTS_PLUGIN_INGESTION = f"events_plugin_ingestion{suffix}" +KAFKA_PLUGIN_LOG_ENTRIES = f"plugin_log_entries{suffix}" +KAFKA_DEAD_LETTER_QUEUE = f"events_dead_letter_queue{suffix}" +KAFKA_GROUPS = f"clickhouse_groups{suffix}" diff --git a/ee/management/commands/materialize_columns.py b/ee/management/commands/materialize_columns.py new file mode 100644 index 0000000000000..94689920db3d1 --- /dev/null +++ b/ee/management/commands/materialize_columns.py @@ -0,0 +1,72 @@ +import logging + +from django.core.management.base import BaseCommand + +from ee.clickhouse.materialized_columns import materialize +from ee.clickhouse.materialized_columns.analyze import logger, materialize_properties_task +from posthog.settings import ( + MATERIALIZE_COLUMNS_ANALYSIS_PERIOD_HOURS, + MATERIALIZE_COLUMNS_BACKFILL_PERIOD_DAYS, + MATERIALIZE_COLUMNS_MAX_AT_ONCE, + MATERIALIZE_COLUMNS_MINIMUM_QUERY_TIME, +) + + +class Command(BaseCommand): + help = "Materialize properties into columns in clickhouse" + + def add_arguments(self, parser): + parser.add_argument("--dry-run", action="store_true", help="Print plan instead of executing it") + + parser.add_argument("--property", help="Property to materialize. Skips analysis.") + parser.add_argument( + "--property-table", type=str, default="events", choices=["events", "person"], help="Table of --property" + ) + parser.add_argument( + "--backfill-period", + type=int, + default=MATERIALIZE_COLUMNS_BACKFILL_PERIOD_DAYS, + help="How many days' worth of data to backfill. 0 to disable. Same as MATERIALIZE_COLUMNS_BACKFILL_PERIOD_DAYS env variable.", + ) + + parser.add_argument( + "--min-query-time", + type=int, + default=MATERIALIZE_COLUMNS_MINIMUM_QUERY_TIME, + help="Minimum query time (ms) before a query is considered for optimization. Same as MATERIALIZE_COLUMNS_MINIMUM_QUERY_TIME env variable.", + ) + parser.add_argument( + "--analyze-period", + type=int, + default=MATERIALIZE_COLUMNS_ANALYSIS_PERIOD_HOURS, + help="How long of a time period to analyze. Same as MATERIALIZE_COLUMNS_ANALYSIS_PERIOD_HOURS env variable.", + ) + parser.add_argument( + "--max-columns", + type=int, + default=MATERIALIZE_COLUMNS_MAX_AT_ONCE, + help="Max number of columns to materialize via single invocation. Same as MATERIALIZE_COLUMNS_MAX_AT_ONCE env variable.", + ) + + def handle(self, *args, **options): + logger.setLevel(logging.INFO) + + if options["dry_run"]: + logger.warn("Dry run: No changes to the tables will be made!") + + if options.get("property"): + logger.info(f"Materializing column. 
table={options['property_table']}, property_name={options['property']}") + + materialize_properties_task( + columns_to_materialize=[(options["property_table"], options["property"], 0)], + backfill_period_days=options["backfill_period"], + dry_run=options["dry_run"], + ) + else: + materialize_properties_task( + time_to_analyze_hours=options["analyze_period"], + maximum=options["max_columns"], + min_query_time=options["min_query_time"], + backfill_period_days=options["backfill_period"], + dry_run=options["dry_run"], + ) diff --git a/ee/management/commands/migrate_clickhouse.py b/ee/management/commands/migrate_clickhouse.py index 9d59d775316a2..78fd564586b9e 100644 --- a/ee/management/commands/migrate_clickhouse.py +++ b/ee/management/commands/migrate_clickhouse.py @@ -1,26 +1,88 @@ +import datetime +from textwrap import indent + from django.core.management.base import BaseCommand from infi.clickhouse_orm import Database # type: ignore +from infi.clickhouse_orm.migrations import MigrationHistory # type: ignore +from infi.clickhouse_orm.utils import import_submodules # type: ignore from posthog.settings import ( CLICKHOUSE_DATABASE, CLICKHOUSE_HTTP_URL, CLICKHOUSE_PASSWORD, - CLICKHOUSE_USERNAME, - CLICKHOUSE_VERIFY, + CLICKHOUSE_REPLICATION, + CLICKHOUSE_USER, ) +MIGRATIONS_PACKAGE_NAME = "ee.clickhouse.migrations" + class Command(BaseCommand): help = "Migrate clickhouse" + def add_arguments(self, parser): + parser.add_argument( + "--upto", default=99_999, type=int, help="Database state will be brought to the state after that migration." + ) + parser.add_argument("--fake", action="store_true", help="Mark migrations as run without actually running them.") + parser.add_argument( + "--plan", action="store_true", help="Shows a list of the migration actions that will be performed." + ) + parser.add_argument( + "--print-sql", + action="store_true", + help="Only use with --plan. 
Also prints SQL for each migration to be applied.", + ) + def handle(self, *args, **options): - try: - Database( - CLICKHOUSE_DATABASE, - db_url=CLICKHOUSE_HTTP_URL, - username=CLICKHOUSE_USERNAME, - password=CLICKHOUSE_PASSWORD, - verify_ssl_cert=False, - ).migrate("ee.clickhouse.migrations") - except Exception as e: - print(e) + self.migrate(CLICKHOUSE_HTTP_URL, options) + + def migrate(self, host, options): + database = Database( + CLICKHOUSE_DATABASE, + db_url=host, + username=CLICKHOUSE_USER, + password=CLICKHOUSE_PASSWORD, + verify_ssl_cert=False, + ) + + if options["plan"]: + print("List of clickhouse migrations to be applied:") + migrations = list(self.get_migrations(database, options["upto"])) + for migration_name, operations in migrations: + print(f"Migration would get applied: {migration_name}") + for op in operations: + sql = getattr(op, "_sql") + if options["print_sql"] and sql is not None: + print(indent("\n\n".join(sql), " ")) + if len(migrations) == 0: + print("Clickhouse migrations up to date!") + elif options["fake"]: + for migration_name, _ in self.get_migrations(database, options["upto"]): + print(f"Faked migration: {migration_name}") + database.insert( + [ + MigrationHistory( + package_name=MIGRATIONS_PACKAGE_NAME, + module_name=migration_name, + applied=datetime.date.today(), + ) + ] + ) + print("Migrations done") + else: + database.migrate(MIGRATIONS_PACKAGE_NAME, options["upto"], replicated=CLICKHOUSE_REPLICATION) + print("Migration successful") + + def get_migrations(self, database, upto): + applied_migrations = database._get_applied_migrations( + MIGRATIONS_PACKAGE_NAME, replicated=CLICKHOUSE_REPLICATION + ) + modules = import_submodules(MIGRATIONS_PACKAGE_NAME) + unapplied_migrations = set(modules.keys()) - applied_migrations + + for migration_name in sorted(unapplied_migrations): + yield migration_name, modules[migration_name].operations + + if int(migration_name[:4]) >= upto: + break diff --git a/ee/management/commands/setup_test_environment.py b/ee/management/commands/setup_test_environment.py new file mode 100644 index 0000000000000..329085772e720 --- /dev/null +++ b/ee/management/commands/setup_test_environment.py @@ -0,0 +1,40 @@ +from django.core.management.base import BaseCommand + +from posthog.utils import is_clickhouse_enabled + + +class Command(BaseCommand): + help = "Set up databases for non-Python tests that depend on the Django server" + + def handle(self, *args, **options): + from django.test.runner import DiscoverRunner as TestRunner + + test_runner = TestRunner(interactive=False) + test_runner.setup_databases() + test_runner.setup_test_environment() + + if is_clickhouse_enabled(): + from infi.clickhouse_orm import Database # type: ignore + + from posthog.settings import ( + CLICKHOUSE_DATABASE, + CLICKHOUSE_HTTP_URL, + CLICKHOUSE_PASSWORD, + CLICKHOUSE_REPLICATION, + CLICKHOUSE_USER, + CLICKHOUSE_VERIFY, + ) + + database = Database( + CLICKHOUSE_DATABASE, + db_url=CLICKHOUSE_HTTP_URL, + username=CLICKHOUSE_USER, + password=CLICKHOUSE_PASSWORD, + verify_ssl_cert=CLICKHOUSE_VERIFY, + ) + + try: + database.create_database() + except: + pass + database.migrate("ee.clickhouse.migrations", replicated=CLICKHOUSE_REPLICATION) diff --git a/ee/migrations/0003_license_max_users.py b/ee/migrations/0003_license_max_users.py new file mode 100644 index 0000000000000..bf04e997d4441 --- /dev/null +++ b/ee/migrations/0003_license_max_users.py @@ -0,0 +1,16 @@ +# Generated by Django 3.0.11 on 2021-04-14 00:20 + +from django.db import migrations, models + + +class 
Migration(migrations.Migration): + + dependencies = [ + ("ee", "0002_hook"), + ] + + operations = [ + migrations.AddField( + model_name="license", name="max_users", field=models.IntegerField(default=None, null=True), + ), + ] diff --git a/ee/migrations/0004_enterpriseeventdefinition_enterprisepropertydefinition.py b/ee/migrations/0004_enterpriseeventdefinition_enterprisepropertydefinition.py new file mode 100644 index 0000000000000..4c7a93e59bf4c --- /dev/null +++ b/ee/migrations/0004_enterpriseeventdefinition_enterprisepropertydefinition.py @@ -0,0 +1,91 @@ +# Generated by Django 3.1.8 on 2021-06-02 19:42 + +import django.contrib.postgres.fields +import django.db.models.deletion +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("posthog", "0156_insight_short_id"), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ("ee", "0003_license_max_users"), + ] + + operations = [ + migrations.CreateModel( + name="EnterprisePropertyDefinition", + fields=[ + ( + "propertydefinition_ptr", + models.OneToOneField( + auto_created=True, + on_delete=django.db.models.deletion.CASCADE, + parent_link=True, + primary_key=True, + serialize=False, + to="posthog.propertydefinition", + ), + ), + ("description", models.CharField(blank=True, max_length=400)), + ( + "tags", + django.contrib.postgres.fields.ArrayField( + base_field=models.CharField(max_length=32), blank=True, default=list, null=True, size=None + ), + ), + ("updated_at", models.DateTimeField(auto_now=True)), + ( + "updated_by", + models.ForeignKey( + blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL + ), + ), + ], + options={"abstract": False,}, + bases=("posthog.propertydefinition",), + ), + migrations.CreateModel( + name="EnterpriseEventDefinition", + fields=[ + ( + "eventdefinition_ptr", + models.OneToOneField( + auto_created=True, + on_delete=django.db.models.deletion.CASCADE, + parent_link=True, + primary_key=True, + serialize=False, + to="posthog.eventdefinition", + ), + ), + ("description", models.CharField(blank=True, max_length=400)), + ( + "tags", + django.contrib.postgres.fields.ArrayField( + base_field=models.CharField(max_length=32), blank=True, default=list, null=True, size=None + ), + ), + ("updated_at", models.DateTimeField(auto_now=True)), + ( + "owner", + models.ForeignKey( + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="event_definitions", + to=settings.AUTH_USER_MODEL, + ), + ), + ( + "updated_by", + models.ForeignKey( + blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL + ), + ), + ], + options={"abstract": False,}, + bases=("posthog.eventdefinition",), + ), + ] diff --git a/ee/migrations/0005_project_based_permissioning.py b/ee/migrations/0005_project_based_permissioning.py new file mode 100644 index 0000000000000..42e3ba203abbc --- /dev/null +++ b/ee/migrations/0005_project_based_permissioning.py @@ -0,0 +1,57 @@ +# Generated by Django 3.2.5 on 2021-09-10 11:39 + +import django.db.models.deletion +from django.conf import settings +from django.db import migrations, models + +import posthog.models.utils + + +class Migration(migrations.Migration): + + dependencies = [ + ("posthog", "0170_project_based_permissioning"), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ("ee", "0004_enterpriseeventdefinition_enterprisepropertydefinition"), + ] + + operations = [ + migrations.CreateModel( + 
name="ExplicitTeamMembership", + fields=[ + ( + "id", + models.UUIDField( + default=posthog.models.utils.UUIDT, editable=False, primary_key=True, serialize=False + ), + ), + ("level", models.PositiveSmallIntegerField(choices=[(1, "member"), (8, "administrator")], default=1)), + ("joined_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ( + "team", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="explicit_memberships", + related_query_name="explicit_membership", + to="posthog.team", + ), + ), + ( + "parent_membership", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="explicit_team_memberships", + related_query_name="explicit_team_membership", + to="posthog.organizationmembership", + ), + ), + ], + ), + migrations.AddConstraint( + model_name="explicitteammembership", + constraint=models.UniqueConstraint( + fields=("team", "parent_membership"), name="unique_explicit_team_membership" + ), + ), + ] diff --git a/ee/models/__init__.py b/ee/models/__init__.py index ed98e70491db7..552ab35c64c08 100644 --- a/ee/models/__init__.py +++ b/ee/models/__init__.py @@ -1,2 +1,5 @@ +from .event_definition import EventDefinition +from .explicit_team_membership import ExplicitTeamMembership from .hook import Hook from .license import License +from .property_definition import PropertyDefinition diff --git a/ee/models/event_definition.py b/ee/models/event_definition.py new file mode 100644 index 0000000000000..d61b76c02fa13 --- /dev/null +++ b/ee/models/event_definition.py @@ -0,0 +1,12 @@ +from django.contrib.postgres.fields import ArrayField +from django.db import models + +from posthog.models.event_definition import EventDefinition + + +class EnterpriseEventDefinition(EventDefinition): + owner = models.ForeignKey("posthog.User", null=True, on_delete=models.SET_NULL, related_name="event_definitions",) + description: models.CharField = models.CharField(max_length=400, blank=True) + tags: ArrayField = ArrayField(models.CharField(max_length=32), null=True, blank=True, default=list) + updated_at: models.DateTimeField = models.DateTimeField(auto_now=True) + updated_by = models.ForeignKey("posthog.User", null=True, on_delete=models.SET_NULL, blank=True) diff --git a/ee/models/explicit_team_membership.py b/ee/models/explicit_team_membership.py new file mode 100644 index 0000000000000..f70c2904f29d6 --- /dev/null +++ b/ee/models/explicit_team_membership.py @@ -0,0 +1,50 @@ +from typing import TYPE_CHECKING + +from django.db import models + +from posthog.models.utils import UUIDModel, sane_repr + +if TYPE_CHECKING: + from posthog.models.organization import OrganizationMembership + + +class ExplicitTeamMembership(UUIDModel): + class Level(models.IntegerChoices): + """Keep in sync with OrganizationMembership.Level (only difference being organizations having an Owner).""" + + MEMBER = 1, "member" + ADMIN = 8, "administrator" + + team: models.ForeignKey = models.ForeignKey( + "posthog.Team", + on_delete=models.CASCADE, + related_name="explicit_memberships", + related_query_name="explicit_membership", + ) + parent_membership: models.ForeignKey = models.ForeignKey( + "posthog.OrganizationMembership", + on_delete=models.CASCADE, + related_name="explicit_team_memberships", + related_query_name="explicit_team_membership", + ) + level: models.PositiveSmallIntegerField = models.PositiveSmallIntegerField( + default=Level.MEMBER, choices=Level.choices + ) + joined_at: models.DateTimeField = 
models.DateTimeField(auto_now_add=True) + updated_at: models.DateTimeField = models.DateTimeField(auto_now=True) + + class Meta: + constraints = [ + models.UniqueConstraint(fields=["team", "parent_membership"], name="unique_explicit_team_membership"), + ] + + def __str__(self): + return str(self.Level(self.level)) + + @property + def effective_level(self) -> "OrganizationMembership.Level": + """If organization level is higher than project level, then that takes precedence over explicit project level. + """ + return max(self.level, self.parent_membership.level) + + __repr__ = sane_repr("team", "parent_membership", "level") diff --git a/ee/models/hook.py b/ee/models/hook.py index 08a62eee8f946..41de5423f4aed 100644 --- a/ee/models/hook.py +++ b/ee/models/hook.py @@ -1,10 +1,11 @@ -import json from typing import Optional from django.db import models from rest_hooks.models import AbstractHook +from statshog.defaults.django import statsd from ee.tasks.hooks import DeliverHook +from posthog.constants import AvailableFeature from posthog.models.team import Team from posthog.models.utils import generate_random_token @@ -17,18 +18,17 @@ class Hook(AbstractHook): def find_and_fire_hook( - event_name: str, - instance: models.Model, - user_override: Optional[Team] = None, - payload_override: Optional[dict] = None, + event_name: str, instance: models.Model, user_override: Team, payload_override: Optional[dict] = None, ): - hooks = Hook.objects.select_related("user").filter(event=event_name, team=user_override) + if not user_override.organization.is_feature_available(AvailableFeature.ZAPIER): + return + hooks = Hook.objects.filter(event=event_name, team=user_override) if event_name == "action_performed": # action_performed is a resource_id-filterable hook - hooks = hooks.filter(models.Q(resource_id=instance.pk) | models.Q(resource_id__isnull=True)) + hooks = hooks.filter(models.Q(resource_id=instance.pk)) for hook in hooks: - if hook.team.organization.is_feature_available("zapier"): - hook.deliver_hook(instance, payload_override) + statsd.incr("posthog_cloud_hooks_rest_fired") + hook.deliver_hook(instance, payload_override) def deliver_hook_wrapper(target, payload, instance, hook): diff --git a/ee/models/license.py b/ee/models/license.py index 0fba21ea843c4..322b69caf3d29 100644 --- a/ee/models/license.py +++ b/ee/models/license.py @@ -1,20 +1,30 @@ -from typing import Any +from typing import Any, List, Optional, cast import requests +from django.contrib.auth import get_user_model from django.db import models +from django.db.models.signals import post_save +from django.dispatch.dispatcher import receiver +from django.utils import timezone +from rest_framework import exceptions, status +from posthog.celery import sync_all_organization_available_features +from posthog.constants import AvailableFeature -class LicenseError(Exception): - """Exception raised for licensing errors. - Attributes: - code -- code of the exception - detail -- message of the exception +class LicenseError(exceptions.APIException): """ + Exception raised for licensing errors. + """ + + default_type = "license_error" + default_code = "license_error" + status_code = status.HTTP_400_BAD_REQUEST + default_detail = "There was a problem with your current license." 
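Because LicenseError is now a rest_framework APIException subclass, raising it from any DRF view surfaces as a structured HTTP 400 carrying the supplied detail rather than an unhandled error. A hedged usage sketch; the view class and the error code/detail strings are hypothetical and not part of the patch:

# Hypothetical illustration of raising LicenseError from a DRF view.
from rest_framework.response import Response
from rest_framework.views import APIView

from ee.models.license import LicenseError


class LicenseActivationView(APIView):  # hypothetical view, for illustration only
    def post(self, request):
        key = request.data.get("key")
        if not key:
            # DRF's exception handling turns this into an HTTP 400 response
            # carrying the detail message, per the status_code declared above.
            raise LicenseError("missing_key", "A license key must be provided.")
        return Response({"ok": True})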
def __init__(self, code, detail): self.code = code - self.detail = detail + self.detail = exceptions._get_error_details(detail, code) class LicenseManager(models.Manager): @@ -26,31 +36,65 @@ def create(self, *args: Any, **kwargs: Any) -> "License": kwargs["valid_until"] = resp["valid_until"] kwargs["plan"] = resp["plan"] - return self._create(*args, **kwargs) + kwargs["max_users"] = resp.get("max_users", 0) + return cast(License, super().create(*args, **kwargs)) - def _create(self, *args: Any, **kwargs: Any) -> "License": - return super().create(*args, **kwargs) + def first_valid(self) -> Optional["License"]: + return cast(Optional[License], (self.filter(valid_until__gte=timezone.now()).first())) class License(models.Model): - objects = LicenseManager() + objects: LicenseManager = LicenseManager() created_at: models.DateTimeField = models.DateTimeField(auto_now_add=True) plan: models.CharField = models.CharField(max_length=200) valid_until: models.DateTimeField = models.DateTimeField() key: models.CharField = models.CharField(max_length=200) + max_users: models.IntegerField = models.IntegerField(default=None, null=True) # None = no restriction - # TODO: This logic should go on posthog-production (requires abstraction on models/organization.py) - STARTER_PLAN = "starter" # cloud - GROWTH_PLAN = "growth" # cloud - STARTUP_PLAN = "startup" # cloud - STARTER_FEATURES = ["organizations_projects"] + SCALE_PLAN = "scale" + SCALE_FEATURES = [ + AvailableFeature.ZAPIER, + AvailableFeature.ORGANIZATIONS_PROJECTS, + AvailableFeature.PROJECT_BASED_PERMISSIONING, + AvailableFeature.GOOGLE_LOGIN, + AvailableFeature.DASHBOARD_COLLABORATION, + AvailableFeature.INGESTION_TAXONOMY, + AvailableFeature.PATHS_ADVANCED, + AvailableFeature.CORRELATION_ANALYSIS, + ] ENTERPRISE_PLAN = "enterprise" - ENTERPRISE_FEATURES = ["zapier", "organizations_projects"] - PLANS = { - ENTERPRISE_PLAN: ENTERPRISE_FEATURES, - STARTER_PLAN: STARTER_FEATURES, - GROWTH_PLAN: ENTERPRISE_FEATURES, - STARTUP_PLAN: ENTERPRISE_FEATURES, - } + ENTERPRISE_FEATURES = SCALE_FEATURES + [ + AvailableFeature.SAML, + ] + PLANS = {SCALE_PLAN: SCALE_FEATURES, ENTERPRISE_PLAN: ENTERPRISE_FEATURES} + + @property + def available_features(self) -> List[AvailableFeature]: + return self.PLANS.get(self.plan, []) + + +def get_licensed_users_available() -> Optional[int]: + """ + Returns the number of user slots available that can be created based on the instance's current license. + Not relevant for cloud users. + `None` means unlimited users. 
+ """ + + license = License.objects.first_valid() + from posthog.models import OrganizationInvite + + if license: + if license.max_users is None: + return None + + users_left = license.max_users - get_user_model().objects.count() - OrganizationInvite.objects.count() + return max(users_left, 0) + + return None + + +@receiver(post_save, sender=License) +def license_saved(sender, instance, created, raw, using, **kwargs): + sync_all_organization_available_features() diff --git a/ee/models/property_definition.py b/ee/models/property_definition.py new file mode 100644 index 0000000000000..6d7d2a7e4f5fd --- /dev/null +++ b/ee/models/property_definition.py @@ -0,0 +1,11 @@ +from django.contrib.postgres.fields import ArrayField +from django.db import models + +from posthog.models.property_definition import PropertyDefinition + + +class EnterprisePropertyDefinition(PropertyDefinition): + description: models.CharField = models.CharField(max_length=400, blank=True) + tags: ArrayField = ArrayField(models.CharField(max_length=32), null=True, blank=True, default=list) + updated_at: models.DateTimeField = models.DateTimeField(auto_now=True) + updated_by = models.ForeignKey("posthog.User", null=True, on_delete=models.SET_NULL, blank=True) diff --git a/ee/settings.py b/ee/settings.py index e39107226392b..aebc0838afbef 100644 --- a/ee/settings.py +++ b/ee/settings.py @@ -1,11 +1,14 @@ """ Django settings for PostHog Enterprise Edition. """ -from typing import Dict +import os +from typing import Dict, List -from posthog.settings import CLICKHOUSE, PRIMARY_DB, TEST +from posthog.constants import AnalyticsDBMS +from posthog.settings import AUTHENTICATION_BACKENDS, PRIMARY_DB, SITE_URL, TEST, get_from_env +from posthog.utils import str_to_bool -# Zapier +# Zapier REST hooks HOOK_EVENTS: Dict[str, str] = { # "event_name": "App.Model.Action" (created/updated/deleted) "action_defined": "posthog.Action.created_custom", @@ -15,4 +18,70 @@ HOOK_FINDER = "ee.models.hook.find_and_fire_hook" HOOK_DELIVERER = "ee.models.hook.deliver_hook_wrapper" -KAFKA_ENABLED = PRIMARY_DB == CLICKHOUSE and not TEST +# Social auth +SOCIAL_AUTH_GOOGLE_OAUTH2_KEY = os.getenv("SOCIAL_AUTH_GOOGLE_OAUTH2_KEY") +SOCIAL_AUTH_GOOGLE_OAUTH2_SECRET = os.getenv("SOCIAL_AUTH_GOOGLE_OAUTH2_SECRET") +if "SOCIAL_AUTH_GOOGLE_OAUTH2_WHITELISTED_DOMAINS" in os.environ: + SOCIAL_AUTH_GOOGLE_OAUTH2_WHITELISTED_DOMAINS: List[str] = os.environ[ + "SOCIAL_AUTH_GOOGLE_OAUTH2_WHITELISTED_DOMAINS" + ].split(",") + +AUTHENTICATION_BACKENDS = AUTHENTICATION_BACKENDS + [ + "social_core.backends.google.GoogleOAuth2", +] + +# SAML +SAML_DISABLED = get_from_env("SAML_DISABLED", False, type_cast=str_to_bool) +SAML_CONFIGURED = False +SOCIAL_AUTH_SAML_SP_ENTITY_ID = SITE_URL +SOCIAL_AUTH_SAML_SECURITY_CONFIG = { + "wantAttributeStatement": False, # AttributeStatement is optional in the specification +} +# Attributes below are required for the SAML integration from social_core to work properly +SOCIAL_AUTH_SAML_SP_PUBLIC_CERT = "" +SOCIAL_AUTH_SAML_SP_PRIVATE_KEY = "" +SOCIAL_AUTH_SAML_ORG_INFO = {"en-US": {"name": "posthog", "displayname": "PostHog", "url": "https://posthog.com"}} +SOCIAL_AUTH_SAML_TECHNICAL_CONTACT = {"givenName": "PostHog Support", "emailAddress": "hey@posthog.com"} +SOCIAL_AUTH_SAML_SUPPORT_CONTACT = SOCIAL_AUTH_SAML_TECHNICAL_CONTACT + +# Set settings only if SAML is enabled +if not SAML_DISABLED and os.getenv("SAML_ENTITY_ID") and os.getenv("SAML_ACS_URL") and os.getenv("SAML_X509_CERT"): + SAML_CONFIGURED = True + AUTHENTICATION_BACKENDS = 
AUTHENTICATION_BACKENDS + [ + "social_core.backends.saml.SAMLAuth", + ] + SOCIAL_AUTH_SAML_ENABLED_IDPS = { + "posthog_custom": { + "entity_id": os.getenv("SAML_ENTITY_ID"), + "url": os.getenv("SAML_ACS_URL"), + "x509cert": os.getenv("SAML_X509_CERT"), + "attr_user_permanent_id": os.getenv("SAML_ATTR_PERMANENT_ID", "name_id"), + "attr_first_name": os.getenv("SAML_ATTR_FIRST_NAME", "first_name"), + "attr_last_name": os.getenv("SAML_ATTR_LAST_NAME", "last_name"), + "attr_email": os.getenv("SAML_ATTR_EMAIL", "email"), + }, + } + SAML_ENFORCED = get_from_env("SAML_ENFORCED", False, type_cast=str_to_bool) + + +# ClickHouse and Kafka +KAFKA_ENABLED = PRIMARY_DB == AnalyticsDBMS.CLICKHOUSE and not TEST + +# Settings specific to materialized columns + +# Whether materialized columns should be created or used at query time +MATERIALIZED_COLUMNS_ENABLED = get_from_env("MATERIALIZED_COLUMNS_ENABLED", True, type_cast=str_to_bool) + +# Schedule to run column materialization on. Follows crontab syntax. +# Use an empty string to disable materialization +MATERIALIZE_COLUMNS_SCHEDULE_CRON = get_from_env("MATERIALIZE_COLUMNS_SCHEDULE_CRON", "0 5 * * SAT") +# Minimum query time before a query is considered for optimization by adding materialized columns +MATERIALIZE_COLUMNS_MINIMUM_QUERY_TIME = get_from_env("MATERIALIZE_COLUMNS_MINIMUM_QUERY_TIME", 3000, type_cast=int) +# How many hours backwards to look for queries to optimize +MATERIALIZE_COLUMNS_ANALYSIS_PERIOD_HOURS = get_from_env( + "MATERIALIZE_COLUMNS_ANALYSIS_PERIOD_HOURS", 7 * 24, type_cast=int +) +# How big of a timeframe to backfill when materializing event properties. 0 for no backfilling +MATERIALIZE_COLUMNS_BACKFILL_PERIOD_DAYS = get_from_env("MATERIALIZE_COLUMNS_BACKFILL_PERIOD_DAYS", 90, type_cast=int) +# Maximum number of columns to materialize at once. Avoids running into resource bottlenecks (storage + ingest + backfilling). 
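+# Putting the defaults together (illustrative note): out of the box the job runs Saturdays at 05:00, considers only queries slower than 3000 (presumably milliseconds) seen over the past 7 days, backfills 90 days of data, and materializes at most 10 new columns per run.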
+MATERIALIZE_COLUMNS_MAX_AT_ONCE = get_from_env("MATERIALIZE_COLUMNS_MAX_AT_ONCE", 10, type_cast=int) diff --git a/ee/tasks/hooks.py b/ee/tasks/hooks.py index f03c5c6667eb0..7fdf293e0a64c 100644 --- a/ee/tasks/hooks.py +++ b/ee/tasks/hooks.py @@ -1,10 +1,8 @@ import json -from typing import Optional import requests from celery.task import Task from django.core.serializers.json import DjangoJSONEncoder -from rest_hooks.utils import get_hook_model class DeliverHook(Task): @@ -20,7 +18,8 @@ def run(self, target: str, payload: dict, hook_id: str) -> None: ) if response.status_code == 410 and hook_id: # Delete hook on our side if it's gone on Zapier's - Hook = get_hook_model() + from ee.models.hook import Hook + Hook.objects.filter(id=hook_id).delete() return if response.status_code >= 500: diff --git a/ee/tasks/materialized_columns.py b/ee/tasks/materialized_columns.py new file mode 100644 index 0000000000000..5c35047b9d197 --- /dev/null +++ b/ee/tasks/materialized_columns.py @@ -0,0 +1,50 @@ +from celery.utils.log import get_task_logger + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.materialized_columns.columns import TRIM_AND_EXTRACT_PROPERTY, ColumnName, get_materialized_columns +from posthog.settings import CLICKHOUSE_CLUSTER, CLICKHOUSE_DATABASE, CLICKHOUSE_REPLICATION + +logger = get_task_logger(__name__) + + +def mark_all_materialized() -> None: + if any_ongoing_mutations(): + logger.info("There are running mutations, skipping marking as materialized") + return + + for table, property_name, column_name in get_materialized_columns_with_default_expression(): + updated_table = "sharded_events" if CLICKHOUSE_REPLICATION and table == "events" else table + + # :TRICKY: On cloud, we ON CLUSTER updates to events/sharded_events but not to persons. Why? ¯\_(ツ)_/¯ + execute_on_cluster = f"ON CLUSTER {CLICKHOUSE_CLUSTER}" if table == "events" else "" + + sync_execute( + f""" + ALTER TABLE {updated_table} + {execute_on_cluster} + MODIFY COLUMN + {column_name} VARCHAR MATERIALIZED {TRIM_AND_EXTRACT_PROPERTY} + """, + {"property": property_name}, + ) + + +def get_materialized_columns_with_default_expression(): + for table in ["events", "person"]: + materialized_columns = get_materialized_columns(table, use_cache=False) + for property_name, column_name in materialized_columns.items(): + if is_default_expression(table, column_name): + yield table, property_name, column_name + + +def any_ongoing_mutations() -> bool: + running_mutations_count = sync_execute("SELECT count(*) FROM system.mutations WHERE is_done = 0")[0][0] + return running_mutations_count > 0 + + +def is_default_expression(table: str, column_name: ColumnName) -> bool: + column_query = sync_execute( + "SELECT default_kind FROM system.columns WHERE table = %(table)s AND name = %(name)s AND database = %(database)s", + {"table": table, "name": column_name, "database": CLICKHOUSE_DATABASE,}, + ) + return len(column_query) > 0 and column_query[0][0] == "DEFAULT" diff --git a/ee/tasks/org_usage_report.py b/ee/tasks/org_usage_report.py new file mode 100644 index 0000000000000..1ba280bdb8123 --- /dev/null +++ b/ee/tasks/org_usage_report.py @@ -0,0 +1,11 @@ +from typing import List + +from posthog.tasks.org_usage_report import OrgReport, send_all_reports + + +def send_all_org_usage_reports(*, dry_run: bool = False) -> List[OrgReport]: + """ + Creates and sends usage reports for all teams. + Returns a list of all the successfully sent reports. 
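+ This EE wrapper simply pins the shared task's data source to ClickHouse (see below).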
+ """ + return send_all_reports(dry_run=dry_run, data_source="clickhouse") diff --git a/ee/tasks/send_license_usage.py b/ee/tasks/send_license_usage.py new file mode 100644 index 0000000000000..2f773d127bda3 --- /dev/null +++ b/ee/tasks/send_license_usage.py @@ -0,0 +1,63 @@ +import posthoganalytics +import requests +from dateutil.relativedelta import relativedelta +from django.utils import timezone + +from ee.clickhouse.client import sync_execute +from ee.models.license import License +from posthog.models import User +from posthog.tasks.status_report import get_instance_licenses + + +def send_license_usage(): + license = License.objects.first_valid() + user = User.objects.first() + if not license: + return + try: + date_from = (timezone.now() - relativedelta(days=1)).replace(hour=0, minute=0, second=0, microsecond=0) + date_to = timezone.now().replace(hour=0, minute=0, second=0, microsecond=0) + events_count = sync_execute( + "select count(1) from events where timestamp >= %(date_from)s and timestamp < %(date_to)s and not startsWith(event, '$$')", + {"date_from": date_from, "date_to": date_to}, + )[0][0] + response = requests.post( + "https://license.posthog.com/licenses/usage", + data={"date": date_from.strftime("%Y-%m-%d"), "key": license.key, "events_count": events_count,}, + ) + + response.raise_for_status() + if not response.ok: + posthoganalytics.capture( + user.distinct_id, # type: ignore + "send license usage data error", + { + "error": response.content, + "status_code": response.status_code, + "date": date_from.strftime("%Y-%m-%d"), + "events_count": events_count, + "organization_name": user.current_organization.name, # type: ignore + }, + ) + return + + posthoganalytics.capture( + user.distinct_id, # type: ignore + "send license usage data", + { + "date": date_from.strftime("%Y-%m-%d"), + "events_count": events_count, + "license_keys": get_instance_licenses(), + "organization_name": user.current_organization.name, # type: ignore + }, + ) + except Exception as err: + posthoganalytics.capture( + user.distinct_id, # type: ignore + "send license usage data error", + { + "error": str(err), + "date": date_from.strftime("%Y-%m-%d"), + "organization_name": user.current_organization.name, # type: ignore + }, + ) diff --git a/ee/tasks/test/test_calculate_cohort.py b/ee/tasks/test/test_calculate_cohort.py new file mode 100644 index 0000000000000..b63ca534246c7 --- /dev/null +++ b/ee/tasks/test/test_calculate_cohort.py @@ -0,0 +1,233 @@ +from unittest.mock import patch +from uuid import uuid4 + +from freezegun import freeze_time + +from ee.clickhouse.client import sync_execute +from ee.clickhouse.models.event import create_event +from ee.clickhouse.util import ClickhouseTestMixin +from posthog.models.cohort import Cohort +from posthog.models.person import Person +from posthog.tasks.calculate_cohort import insert_cohort_from_query +from posthog.tasks.test.test_calculate_cohort import calculate_cohort_test_factory + + +def _create_event(**kwargs): + kwargs.update({"event_uuid": uuid4()}) + create_event(**kwargs) + + +def _create_person(**kwargs): + person = Person.objects.create(**kwargs) + return Person(id=person.uuid) + + +class TestClickhouseCalculateCohort(ClickhouseTestMixin, calculate_cohort_test_factory(_create_event, _create_person)): # type: ignore + @patch("posthog.tasks.calculate_cohort.insert_cohort_from_query.delay") + def test_create_stickiness_cohort(self, _insert_cohort_from_query): + _create_person(team_id=self.team.pk, distinct_ids=["blabla"]) + _create_event( + 
team=self.team, + event="$pageview", + distinct_id="blabla", + properties={"$math_prop": 1}, + timestamp="2021-01-01T12:00:00Z", + ) + response = self.client.post( + f"/api/projects/{self.team.id}/cohorts/?insight=STICKINESS&properties=%5B%5D&interval=day&display=ActionsLineGraph&events=%5B%7B%22id%22%3A%22%24pageview%22%2C%22name%22%3A%22%24pageview%22%2C%22type%22%3A%22events%22%2C%22order%22%3A0%7D%5D&shown_as=Stickiness&date_from=2021-01-01&entity_id=%24pageview&entity_type=events&stickiness_days=1&label=%24pageview", + {"name": "test", "is_static": True}, + ).json() + + cohort_id = response["id"] + + _insert_cohort_from_query.assert_called_once_with( + cohort_id, + "STICKINESS", + { + "date_from": "2021-01-01", + "events": [ + { + "id": "$pageview", + "type": "events", + "order": 0, + "name": "$pageview", + "custom_name": None, + "math": None, + "math_property": None, + "math_group_type_index": None, + "properties": [], + } + ], + "insight": "STICKINESS", + "interval": "day", + "selected_interval": 1, + "shown_as": "Stickiness", + }, + entity_data={ + "id": "$pageview", + "type": "events", + "order": None, + "name": "$pageview", + "custom_name": None, + "math": None, + "math_property": None, + "math_group_type_index": None, + "properties": [], + }, + ) + insert_cohort_from_query( + cohort_id, + "STICKINESS", + { + "date_from": "2021-01-01", + "events": [ + { + "id": "$pageview", + "type": "events", + "order": 0, + "name": "$pageview", + "custom_name": None, + "math": None, + "math_property": None, + "math_group_type_index": None, + "properties": [], + } + ], + "insight": "STICKINESS", + "interval": "day", + "selected_interval": 1, + "shown_as": "Stickiness", + }, + entity_data={ + "id": "$pageview", + "type": "events", + "order": None, + "name": "$pageview", + "custom_name": None, + "math": None, + "math_property": None, + "math_group_type_index": None, + "properties": [], + }, + ) + cohort = Cohort.objects.get(pk=cohort_id) + people = Person.objects.filter(cohort__id=cohort.pk) + self.assertEqual(len(people), 1) + + @patch("posthog.tasks.calculate_cohort.insert_cohort_from_query.delay") + def test_create_trends_cohort(self, _insert_cohort_from_query): + _create_person(team_id=self.team.pk, distinct_ids=["blabla"]) + with freeze_time("2021-01-01 00:06:34"): + _create_event( + team=self.team, + event="$pageview", + distinct_id="blabla", + properties={"$math_prop": 1}, + timestamp="2021-01-01T12:00:00Z", + ) + + with freeze_time("2021-01-02 00:06:34"): + _create_event( + team=self.team, + event="$pageview", + distinct_id="blabla", + properties={"$math_prop": 4}, + timestamp="2021-01-01T12:00:00Z", + ) + + response = self.client.post( + f"/api/projects/{self.team.id}/cohorts/?interval=day&display=ActionsLineGraph&events=%5B%7B%22id%22%3A%22%24pageview%22%2C%22name%22%3A%22%24pageview%22%2C%22type%22%3A%22events%22%2C%22order%22%3A0%7D%5D&properties=%5B%5D&entity_id=%24pageview&entity_type=events&date_from=2021-01-01&date_to=2021-01-01&label=%24pageview", + {"name": "test", "is_static": True}, + ).json() + cohort_id = response["id"] + _insert_cohort_from_query.assert_called_once_with( + cohort_id, + "TRENDS", + { + "date_from": "2021-01-01", + "date_to": "2021-01-01", + "display": "ActionsLineGraph", + "events": [ + { + "id": "$pageview", + "type": "events", + "order": 0, + "name": "$pageview", + "custom_name": None, + "math": None, + "math_property": None, + "math_group_type_index": None, + "properties": [], + } + ], + "entity_id": "$pageview", + "entity_type": "events", + "insight": 
"TRENDS", + "interval": "day", + }, + entity_data={ + "id": "$pageview", + "type": "events", + "order": None, + "name": "$pageview", + "custom_name": None, + "math": None, + "math_property": None, + "math_group_type_index": None, + "properties": [], + }, + ) + insert_cohort_from_query( + cohort_id, + "TRENDS", + { + "date_from": "2021-01-01", + "date_to": "2021-01-01", + "display": "ActionsLineGraph", + "events": [ + { + "id": "$pageview", + "type": "events", + "order": 0, + "name": "$pageview", + "math": None, + "math_property": None, + "math_group_type_index": None, + "properties": [], + } + ], + "entity_id": "$pageview", + "entity_type": "events", + "insight": "TRENDS", + "interval": "day", + }, + entity_data={ + "id": "$pageview", + "type": "events", + "order": 0, + "name": "$pageview", + "math": None, + "math_property": None, + "math_group_type_index": None, + "properties": [], + }, + ) + cohort = Cohort.objects.get(pk=cohort_id) + people = Person.objects.filter(cohort__id=cohort.pk) + self.assertEqual(cohort.errors_calculating, 0) + self.assertEqual( + len(people), + 1, + { + "a": sync_execute( + "select person_id from person_static_cohort where team_id = {} and cohort_id = {} ".format( + self.team.id, cohort.pk + ) + ), + "b": sync_execute( + "select person_id from person_static_cohort FINAL where team_id = {} and cohort_id = {} ".format( + self.team.id, cohort.pk + ) + ), + }, + ) diff --git a/ee/tasks/test/test_org_usage_report.py b/ee/tasks/test/test_org_usage_report.py new file mode 100644 index 0000000000000..e61277557276c --- /dev/null +++ b/ee/tasks/test/test_org_usage_report.py @@ -0,0 +1,28 @@ +from uuid import uuid4 + +from django.utils.timezone import datetime + +from ee.clickhouse.models.event import create_event +from ee.tasks.org_usage_report import send_all_org_usage_reports +from posthog.constants import AnalyticsDBMS +from posthog.models import Person, Team +from posthog.tasks.test.test_org_usage_report import factory_org_usage_report + + +def create_person(distinct_id: str, team: Team) -> Person: + return Person.objects.create(team=team, distinct_ids=[distinct_id]) + + +def create_event_clickhouse(distinct_id: str, event: str, lib: str, created_at: datetime, team: Team) -> None: + create_event( + event_uuid=uuid4(), + team=team, + distinct_id=distinct_id, + event=event, + timestamp=created_at, + properties={"$lib": lib}, + ) + + +class TestOrganizationUsageReport(factory_org_usage_report(create_person, create_event_clickhouse, send_all_org_usage_reports, {"EE_AVAILABLE": True, "USE_TZ": False, "PRIMARY_DB": AnalyticsDBMS.CLICKHOUSE})): # type: ignore + pass diff --git a/ee/tasks/test/test_send_license_usage.py b/ee/tasks/test/test_send_license_usage.py new file mode 100644 index 0000000000000..55da08b9414f6 --- /dev/null +++ b/ee/tasks/test/test_send_license_usage.py @@ -0,0 +1,89 @@ +from unittest.mock import patch +from uuid import uuid4 + +import posthoganalytics +from freezegun import freeze_time + +from ee.api.test.base import LicensedTestMixin +from ee.clickhouse.models.event import create_event +from ee.clickhouse.util import ClickhouseDestroyTablesMixin +from ee.models.license import License +from ee.tasks.send_license_usage import send_license_usage +from posthog.models import organization +from posthog.models.team import Team +from posthog.test.base import APIBaseTest + + +def _create_event(**kwargs): + kwargs.update({"event_uuid": uuid4()}) + create_event(**kwargs) + + +class SendLicenseUsageTest(LicensedTestMixin, ClickhouseDestroyTablesMixin, 
APIBaseTest): + @freeze_time("2021-10-10T23:01:00Z") + @patch("posthoganalytics.capture") + @patch("requests.post") + def test_send_license_usage(self, mock_post, mock_capture): + team2 = Team.objects.create(organization=self.organization) + _create_event(event="$pageview", team=self.team, distinct_id=1, timestamp="2021-10-08T14:01:01Z") + _create_event(event="$pageview", team=self.team, distinct_id=1, timestamp="2021-10-09T12:01:01Z") + _create_event(event="$pageview", team=self.team, distinct_id=1, timestamp="2021-10-09T13:01:01Z") + _create_event( + event="$$internal_metrics_shouldnt_be_billed", + team=self.team, + distinct_id=1, + timestamp="2021-10-09T13:01:01Z", + ) + _create_event(event="$pageview", team=team2, distinct_id=1, timestamp="2021-10-09T14:01:01Z") + _create_event(event="$pageview", team=self.team, distinct_id=1, timestamp="2021-10-10T14:01:01Z") + + send_license_usage() + mock_post.assert_called_once_with( + "https://license.posthog.com/licenses/usage", + data={"date": "2021-10-09", "key": self.license.key, "events_count": 3}, + ) + mock_capture.assert_called_once_with( + self.user.distinct_id, + "send license usage data", + {"date": "2021-10-09", "events_count": 3, "license_keys": ["enterprise"], "organization_name": "Test"}, + ) + + @freeze_time("2021-10-10T23:01:00Z") + @patch("posthoganalytics.capture") + @patch("ee.tasks.send_license_usage.sync_execute", side_effect=Exception()) + def test_send_license_error(self, mock_post, mock_capture): + team2 = Team.objects.create(organization=self.organization) + _create_event(event="$pageview", team=self.team, distinct_id=1, timestamp="2021-10-08T14:01:01Z") + _create_event(event="$pageview", team=self.team, distinct_id=1, timestamp="2021-10-09T12:01:01Z") + _create_event(event="$pageview", team=self.team, distinct_id=1, timestamp="2021-10-09T13:01:01Z") + _create_event( + event="$$internal_metrics_shouldnt_be_billed", + team=self.team, + distinct_id=1, + timestamp="2021-10-09T13:01:01Z", + ) + _create_event(event="$pageview", team=team2, distinct_id=1, timestamp="2021-10-09T14:01:01Z") + _create_event(event="$pageview", team=self.team, distinct_id=1, timestamp="2021-10-10T14:01:01Z") + + send_license_usage() + mock_capture.assert_called_once_with( + self.user.distinct_id, + "send license usage data error", + {"error": "", "date": "2021-10-09", "organization_name": "Test"}, + ) + + +class SendLicenseUsageNoLicenseTest(APIBaseTest): + @freeze_time("2021-10-10T23:01:00Z") + @patch("requests.post") + def test_no_license(self, mock_post): + # Same test, we just don't include the LicensedTestMixin so no license + _create_event(event="$pageview", team=self.team, distinct_id=1, timestamp="2021-10-08T14:01:01Z") + _create_event(event="$pageview", team=self.team, distinct_id=1, timestamp="2021-10-09T12:01:01Z") + _create_event(event="$pageview", team=self.team, distinct_id=1, timestamp="2021-10-09T13:01:01Z") + _create_event(event="$pageview", team=self.team, distinct_id=1, timestamp="2021-10-09T14:01:01Z") + _create_event(event="$pageview", team=self.team, distinct_id=1, timestamp="2021-10-10T14:01:01Z") + + send_license_usage() + + self.assertEqual(mock_post.call_count, 0) diff --git a/ee/tasks/test/test_status_report.py b/ee/tasks/test/test_status_report.py new file mode 100644 index 0000000000000..10e7b00914eaf --- /dev/null +++ b/ee/tasks/test/test_status_report.py @@ -0,0 +1,89 @@ +from datetime import datetime +from uuid import uuid4 + +from freezegun.api import freeze_time + +from ee.clickhouse.client import sync_execute +from 
ee.clickhouse.models.event import create_event +from ee.clickhouse.models.person import create_person_distinct_id +from posthog.models.person import Person +from posthog.models.team import Team +from posthog.models.utils import UUIDT +from posthog.tasks.status_report import status_report +from posthog.tasks.test.test_status_report import factory_status_report + + +def _create_event(distinct_id: str, event: str, lib: str, created_at: datetime, team: Team): + create_event( + event_uuid=uuid4(), + event=event, + distinct_id=distinct_id, + timestamp=created_at, + team=team, + properties={"$lib": lib}, + ) + + +def _create_person(distinct_id: str, team: Team) -> Person: + person = Person.objects.create(team=team, distinct_ids=[distinct_id]) + return Person(id=person.uuid) + + +class TestStatusReport(factory_status_report(_create_event, _create_person)): # type: ignore + # CH only + def test_status_report_duplicate_distinct_ids(self) -> None: + create_person_distinct_id(self.team.id, "duplicate_id1", str(UUIDT())) + create_person_distinct_id(self.team.id, "duplicate_id1", str(UUIDT())) + create_person_distinct_id(self.team.id, "duplicate_id2", str(UUIDT())) + create_person_distinct_id(self.team.id, "duplicate_id2", str(UUIDT())) + create_person_distinct_id(self.team.id, "duplicate_id2", str(UUIDT())) + + for index in range(0, 2): + sync_execute( + "INSERT INTO person_distinct_id SELECT %(distinct_id)s, %(person_id)s, %(team_id)s, 1, %(timestamp)s, 0 VALUES", + { + "distinct_id": "duplicate_id_old", + "person_id": str(UUIDT()), + "team_id": self.team.id, + "timestamp": "2020-01-01 12:01:0%s" % index, + }, + ) + + report = status_report(dry_run=True).get("teams")[self.team.id] # type: ignore + + duplicate_ids_report = report["duplicate_distinct_ids"] + + expected_result = { + "prev_total_ids_with_duplicates": 1, + "prev_total_extra_distinct_id_rows": 1, + "new_total_ids_with_duplicates": 2, + "new_total_extra_distinct_id_rows": 4, + } + + self.assertEqual(duplicate_ids_report, expected_result) + + # CH only + def test_status_report_multiple_ids_per_person(self) -> None: + person_id1 = str(UUIDT()) + person_id2 = str(UUIDT()) + + create_person_distinct_id(self.team.id, "id1", person_id1) + create_person_distinct_id(self.team.id, "id2", person_id1) + create_person_distinct_id(self.team.id, "id3", person_id1) + create_person_distinct_id(self.team.id, "id4", person_id1) + create_person_distinct_id(self.team.id, "id5", person_id1) + + create_person_distinct_id(self.team.id, "id6", person_id2) + create_person_distinct_id(self.team.id, "id7", person_id2) + create_person_distinct_id(self.team.id, "id8", person_id2) + + report = status_report(dry_run=True).get("teams")[self.team.id] # type: ignore + + multiple_ids_report = report["multiple_ids_per_person"] + + expected_result = { + "total_persons_with_more_than_2_ids": 2, + "max_distinct_ids_for_one_person": 5, + } + + self.assertEqual(multiple_ids_report, expected_result) diff --git a/ee/tasks/test/test_webhooks_ee.py b/ee/tasks/test/test_webhooks_ee.py deleted file mode 100644 index 8d7f1acb9c508..0000000000000 --- a/ee/tasks/test/test_webhooks_ee.py +++ /dev/null @@ -1,46 +0,0 @@ -from unittest.mock import call, patch - -import pytz -from django.utils.timezone import now - -from ee.tasks.webhooks_ee import post_event_to_webhook_ee -from posthog.api.test.base import BaseTest -from posthog.models.action import Action -from posthog.models.action_step import ActionStep -from posthog.models.event import Event - - -def _create_action(**kwargs): - team = 
kwargs.pop("team") - name = kwargs.pop("name") - post_to_slack = kwargs.pop("post_to_slack") - action = Action.objects.create(team=team, name=name, post_to_slack=post_to_slack) - ActionStep.objects.create(action=action, event=name) - return action - - -class TestWebhooksEE(BaseTest): - @patch("requests.post") - def test_post_event_to_webhook_ee(self, requests_post): - - self.team.slack_incoming_webhook = "http://slack.com/hook" - self.team.save() - _create_action(team=self.team, name="user paid", post_to_slack=True) - _create_action(team=self.team, name="user not paid", post_to_slack=True) - - _now = now() - - event = { - "event": "user paid", - "properties": {}, - "distinct_id": "test", - "timestamp": _now, - "elements_list": {}, - } - site_url = "http://testserver" - post_event_to_webhook_ee(event, self.team.pk, site_url) - self.assertEqual(requests_post.call_count, 1) - - events = Event.objects.filter(event="User paid") - - self.assertEqual(list(events), []) diff --git a/ee/tasks/webhooks_ee.py b/ee/tasks/webhooks_ee.py deleted file mode 100644 index 803757d86c30f..0000000000000 --- a/ee/tasks/webhooks_ee.py +++ /dev/null @@ -1,51 +0,0 @@ -import re -from typing import Any, Dict - -import requests -from celery import Task -from django.conf import settings - -from posthog.celery import app -from posthog.models import Action, Event, Team -from posthog.tasks.webhooks import determine_webhook_type, get_formatted_message - - -@app.task(ignore_result=True, bind=True, max_retries=3) -def post_event_to_webhook_ee(self: Task, event: Dict[str, Any], team_id: int, site_url: str) -> None: - try: - team = Team.objects.get(pk=team_id) - _event = Event.objects.create( - event=event["event"], - distinct_id=event["distinct_id"], - properties=event["properties"], - team=team, - site_url=site_url, - **({"timestamp": event["timestamp"]} if event["timestamp"] else {}), - **({"elements": event["elements_list"]} if event["elements_list"] else {}) - ) - - actions = Action.objects.filter(team_id=team_id, post_to_slack=True).all() - - if not site_url: - site_url = settings.SITE_URL - - if team.slack_incoming_webhook: - for action in actions: - qs = Event.objects.filter(pk=_event.pk).query_db_by_action(action) - if qs: - message_text, message_markdown = get_formatted_message(action, _event, site_url,) - if determine_webhook_type(team) == "slack": - message = { - "text": message_text, - "blocks": [{"type": "section", "text": {"type": "mrkdwn", "text": message_markdown},},], - } - else: - message = { - "text": message_markdown, - } - requests.post(team.slack_incoming_webhook, verify=False, json=message) - - _event.delete() - - except: - self.retry(countdown=2 ** self.request.retries) diff --git a/ee/urls.py b/ee/urls.py index c9f8f67877045..50ffb05df8823 100644 --- a/ee/urls.py +++ b/ee/urls.py @@ -1,8 +1,22 @@ -from posthog.api import DefaultRouterPlusPlus +from typing import Any, List -from .api import hooks, license +from django.urls.conf import path +from rest_framework_extensions.routers import NestedRegistryItem +from posthog.api.routing import DefaultRouterPlusPlus -def extend_api_router(router: DefaultRouterPlusPlus): - router.register(r"license", license.LicenseViewSet) - router.register(r"hooks", hooks.HookViewSet, basename="hooks") +from .api import authentication, debug_ch_queries, explicit_team_member, hooks, license + + +def extend_api_router(root_router: DefaultRouterPlusPlus, *, projects_router: NestedRegistryItem): + root_router.register(r"license", license.LicenseViewSet) + 
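# Instance-wide EE endpoints go on the root router; project-scoped ones (hooks, explicit members) are registered on the nested projects router below. +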
root_router.register(r"debug_ch_queries", debug_ch_queries.DebugCHQueries, "debug_ch_queries") + projects_router.register(r"hooks", hooks.HookViewSet, "project_hooks", ["team_id"]) + projects_router.register( + r"explicit_members", explicit_team_member.ExplicitTeamMemberViewSet, "project_explicit_members", ["team_id"] + ) + + +urlpatterns: List[Any] = [ + path("api/saml/metadata/", authentication.saml_metadata_view), +] diff --git a/frontend/build.mjs b/frontend/build.mjs new file mode 100755 index 0000000000000..3811e3e23ed72 --- /dev/null +++ b/frontend/build.mjs @@ -0,0 +1,59 @@ +#!/usr/bin/env node +import * as path from 'path' +import { __dirname, copyIndexHtml, copyPublicFolder, buildOrWatch, isDev, startServer } from './utils.mjs' + +function writeIndexHtml(chunks = {}) { + copyIndexHtml('src/index.html', 'dist/index.html', 'index', chunks) + copyIndexHtml('src/layout.html', 'dist/layout.html', 'index', chunks) + copyIndexHtml('src/shared_dashboard.html', 'dist/shared_dashboard.html', 'shared_dashboard', chunks) +} + +let pauseServer = () => {} +let resumeServer = () => {} +if (isDev) { + console.log(`👀 Starting dev server`) + const serverResponse = startServer() + pauseServer = serverResponse.pauseServer + resumeServer = serverResponse.resumeServer +} else { + console.log(`🛳 Starting production build`) +} + +function onBuildComplete(chunks) { + resumeServer() + writeIndexHtml(chunks) +} + +copyPublicFolder() +writeIndexHtml({}) + +await Promise.all([ + buildOrWatch({ + name: 'PostHog App', + entryPoints: ['src/index.tsx'], + bundle: true, + splitting: true, + format: 'esm', + outdir: path.resolve(__dirname, 'dist'), + onBuildStart: pauseServer, + onBuildComplete: onBuildComplete, + }), + buildOrWatch({ + name: 'Shared Dashboard', + entryPoints: ['src/scenes/dashboard/SharedDashboard.tsx'], + bundle: true, + format: 'iife', + outfile: path.resolve(__dirname, 'dist', 'shared_dashboard.js'), + onBuildStart: pauseServer, + onBuildComplete: onBuildComplete, + }), + buildOrWatch({ + name: 'Toolbar', + entryPoints: ['src/toolbar/index.tsx'], + bundle: true, + format: 'iife', + outfile: path.resolve(__dirname, 'dist', 'toolbar.js'), + onBuildStart: pauseServer, + onBuildComplete: onBuildComplete, + }), +]) diff --git a/frontend/public/Inter.woff b/frontend/public/Inter.woff new file mode 100644 index 0000000000000..2fc4f3d2fddf0 Binary files /dev/null and b/frontend/public/Inter.woff differ diff --git a/frontend/public/Inter.woff2 b/frontend/public/Inter.woff2 new file mode 100644 index 0000000000000..21d5159a61c85 Binary files /dev/null and b/frontend/public/Inter.woff2 differ diff --git a/frontend/public/dashboard-empty-state.svg b/frontend/public/dashboard-empty-state.svg new file mode 100644 index 0000000000000..38dd01d569496 --- /dev/null +++ b/frontend/public/dashboard-empty-state.svg @@ -0,0 +1,118 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/frontend/public/email/invite-hero.png b/frontend/public/email/invite-hero.png deleted file mode 100644 index 501daaec82f5c..0000000000000 Binary files a/frontend/public/email/invite-hero.png and /dev/null differ diff --git a/frontend/public/favicon-192x192.png b/frontend/public/favicon-192x192.png deleted file mode 100644 index 3f65f14a0e61c..0000000000000 Binary files a/frontend/public/favicon-192x192.png and /dev/null differ diff --git 
a/frontend/public/favicon-32x32.png b/frontend/public/favicon-32x32.png deleted file mode 100644 index 6a236dffd1bd2..0000000000000 Binary files a/frontend/public/favicon-32x32.png and /dev/null differ diff --git a/frontend/public/hedgehog-bridge-page.png b/frontend/public/hedgehog-bridge-page.png new file mode 100644 index 0000000000000..9e9a5d4e2716e Binary files /dev/null and b/frontend/public/hedgehog-bridge-page.png differ diff --git a/frontend/public/icons/android-chrome-192x192.png b/frontend/public/icons/android-chrome-192x192.png new file mode 100644 index 0000000000000..79c2aaf5f8583 Binary files /dev/null and b/frontend/public/icons/android-chrome-192x192.png differ diff --git a/frontend/public/icons/android-chrome-512x512.png b/frontend/public/icons/android-chrome-512x512.png new file mode 100644 index 0000000000000..0d6ecc3207c51 Binary files /dev/null and b/frontend/public/icons/android-chrome-512x512.png differ diff --git a/frontend/public/icons/apple-touch-icon.png b/frontend/public/icons/apple-touch-icon.png new file mode 100644 index 0000000000000..c61e43a6ff348 Binary files /dev/null and b/frontend/public/icons/apple-touch-icon.png differ diff --git a/frontend/public/icons/favicon-16x16.png b/frontend/public/icons/favicon-16x16.png new file mode 100644 index 0000000000000..106716f177baf Binary files /dev/null and b/frontend/public/icons/favicon-16x16.png differ diff --git a/frontend/public/icons/favicon-32x32.png b/frontend/public/icons/favicon-32x32.png new file mode 100644 index 0000000000000..9c04b329f08d8 Binary files /dev/null and b/frontend/public/icons/favicon-32x32.png differ diff --git a/frontend/public/icons/favicon.ico b/frontend/public/icons/favicon.ico new file mode 100644 index 0000000000000..30be52bd29992 Binary files /dev/null and b/frontend/public/icons/favicon.ico differ diff --git a/frontend/public/icons/safari-pinned-tab.svg b/frontend/public/icons/safari-pinned-tab.svg new file mode 100644 index 0000000000000..16e3311de37a2 --- /dev/null +++ b/frontend/public/icons/safari-pinned-tab.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frontend/public/mountains.svg b/frontend/public/mountains.svg new file mode 100644 index 0000000000000..56d58044d311e --- /dev/null +++ b/frontend/public/mountains.svg @@ -0,0 +1,11 @@ [11 lines of SVG markup] diff --git a/frontend/public/not-found-rocket.svg b/frontend/public/not-found-rocket.svg new file mode 100644 index 0000000000000..0613cc1b8d366 --- /dev/null +++ b/frontend/public/not-found-rocket.svg @@ -0,0 +1,1115 @@ [1,115 lines of SVG markup]
diff --git a/frontend/public/posthog-logo-cloud.svg b/frontend/public/posthog-logo-cloud.svg new file mode 100644 index 0000000000000..8ee711f12549b --- /dev/null +++ b/frontend/public/posthog-logo-cloud.svg @@ -0,0 +1,32 @@ [32 lines of SVG markup] diff --git a/frontend/public/posthog-logo-selfhosted.svg b/frontend/public/posthog-logo-selfhosted.svg new file mode 100644 index 0000000000000..55683cd4259c9 --- /dev/null +++ b/frontend/public/posthog-logo-selfhosted.svg @@ -0,0 +1,27 @@ [27 lines of SVG markup] diff --git a/frontend/public/posthog-logo.png b/frontend/public/posthog-logo.png index 4b2035c5d166b..a108b0b8a3217 100644 Binary files a/frontend/public/posthog-logo.png and b/frontend/public/posthog-logo.png differ diff --git a/frontend/public/site.webmanifest b/frontend/public/site.webmanifest new file mode 100644 index 0000000000000..a5885050b452c --- /dev/null +++ b/frontend/public/site.webmanifest @@ -0,0 +1,18 @@ +{ + "name": "PostHog", + "short_name": "PostHog", + "icons": [ + { + "src": "/static/icons/android-chrome-192x192.png?v=2021-04-28", + "sizes": "192x192", + "type": "image/png" + }, + { + "src": "/static/icons/android-chrome-512x512.png?v=2021-04-28", + "sizes": "512x512", + "type": "image/png" + } + ], + "theme_color": "#ffffff", + "background_color": "#ffffff" +} diff --git a/frontend/src/Storybook.stories.mdx b/frontend/src/Storybook.stories.mdx new file mode 100644 index 0000000000000..c02baef8ca9e4 --- /dev/null +++ b/frontend/src/Storybook.stories.mdx @@ -0,0 +1,47 @@ +import { Meta } from '@storybook/addon-docs'; + + + +# The PostHog Storybook + +We're using Storybook to: +- track the difference between designed and implemented product decisions. +- make it easy to work on hard-to-reach scenes (e.g. preflight check) +- provide permalinks with which to link to stories in issues + +## Running locally + +To run storybook locally, run `yarn storybook`. It'll open on [http://localhost:6006/](http://localhost:6006/) + +## How to create stories + +To capture a scene as a story, run `getReduxState()` inside Chrome's JS console (only works in [dev mode](http://localhost:8000/)) and save the +returned state into a `.json` file.
Then follow the same pattern other `*.stories.tsx` files use: + +```tsx +// Events.stories.tsx +import { Meta } from '@storybook/react' +import { keaStory } from 'lib/storybook/kea-story' + +// import the main component of the scene +import { Events } from '../Events' + +// import the `getReduxState()` output for all the variations you wish to show +import eventsState from './events.json' + +// some metadata and optional parameters +export default { + title: 'PostHog/Scenes/Events', +} as Meta + +// export more stories with different state +export const AllEvents = keaStory(Events, eventsState) + + +``` + +## Fun story: PostHog Onboarding Story + +For a fun story, check out [The Self-Hosted Onboarding Flow](/story/posthog-onboarding-1-preflight--initial), where a user +has to go through three different designs before reaching the app they came for! + diff --git a/frontend/src/antd.less b/frontend/src/antd.less deleted file mode 100644 index 33a9f97a104bf..0000000000000 --- a/frontend/src/antd.less +++ /dev/null @@ -1,34 +0,0 @@ -/* This file sets theming configuration on Ant Design for PostHog. Ant uses LESS which is incompatible -with SASS, which is why configuration is duplicated. To change any variable here, please update vars.scss too */ -@import 'antd/lib/style/themes/default.less'; -@import 'antd/dist/antd.less'; - -@text-color: #2d2d2d; -@text-muted: #d9d9d9; -@primary-color: #5375ff; -@link-color: #5375ff; -@success-color: #77b96c; -@warning-color: #f7a501; -@error-color: #f96132; -@font-size-base: 14px; -@heading-color: @text-color; -@text-color-secondary: rgba(0, 0, 0, 0.45); -@disabled-color: @text-muted; -@border-radius-base: 2px; -@border-color-base: #d9d9d9; -@body-background:  #f2f2f2; -@layout-body-background: #fff; - -.hide-gte-lg { - display: none !important; - @media (max-width: @screen-lg-min) { - display: inherit !important; - } -} - -.hide-lte-lg { - display: none !important; - @media (min-width: @screen-lg-min) { - display: inherit !important; - } -} diff --git a/frontend/src/custom.d.ts b/frontend/src/custom.d.ts new file mode 100644 index 0000000000000..cb017b415bf53 --- /dev/null +++ b/frontend/src/custom.d.ts @@ -0,0 +1,17 @@ +// This fixes TS errors when importing a .svg file +declare module '*.svg' { + const content: any + export default content +} + +// This fixes TS errors when importing a .png file +declare module '*.png' { + const content: any + export default content +} + +// This fixes TS errors when importing an .mp3 file +declare module '*.mp3' { + const content: any + export default content +} diff --git a/frontend/src/global.scss b/frontend/src/global.scss deleted file mode 100644 index 05e0710e04eab..0000000000000 --- a/frontend/src/global.scss +++ /dev/null @@ -1,369 +0,0 @@ -/* Only styles that are shared across multiple components (i.e. global) should go here, trying to keep this file -nimble to simplify maintenance. We separate variables and mixins in vars.scss to be able to import those into local -style files without adding already imported styles. 
*/ - -// Global components -@import 'node_modules/react-toastify/dist/ReactToastify'; -@import './vars'; - -:root { - --primary: #{$primary}; - --success: #{$success}; - --danger: #{$danger}; - --warning: #{$warning}; - --bg-menu: #{$bg_menu}; - --bg-mid: #{$bg_mid}; - --muted: #{$text_muted}; - // Used for graph series - --blue: #{$blue_500}; - --purple: #{purple_500}; - --salmon: #ff906e; - --yellow: #ffc035; - --green: #{$success}; - --indigo: #{$purple_700}; - --cyan: #17a2b8; - --pink: #e83e8c; - --white: #f4f6ff; -} - -// Text styles -.text-default { - color: $text_default; - font-size: 14px; - line-height: 20px; - font-weight: 400; -} - -.text-small { - @extend .text-default; - font-size: 12px; -} - -.text-extra-small { - @extend .text-default; - font-size: 10px; -} - -.page-title { - font-size: 28px; - line-height: 34px; - margin-top: 32px; - font-weight: 700; - color: $text_default; -} - -.page-caption { - @extend .text-default; - max-width: 640px; - margin-bottom: 32px; -} - -.subtitle { - margin-top: 24px; - font-size: 22px; - line-height: 26px; - font-weight: 700; -} - -.l3 { - /* Level 3 title (ideally H3) */ - font-size: 16px; - font-weight: 700; - line-height: 19px; -} - -.text-right { - text-align: right; -} - -.text-left { - text-align: left; -} - -.text-center { - text-align: center; -} - -.text-muted { - color: $text_muted; -} - -// Spacing & layout -.mb { - margin-bottom: $default_spacing; -} - -.mt { - margin-top: $default_spacing; -} - -.mb-05 { - margin-bottom: $default_spacing * 0.5; -} - -.mt-05 { - margin-top: $default_spacing * 0.5; -} - -.mr { - margin-right: $default_spacing; -} - -.ml { - margin-left: $default_spacing; -} - -.mr-05 { - margin-right: $default_spacing * 0.5; -} - -.ml-05 { - margin-left: $default_spacing * 0.5; -} - -.pa { - // Padding all - padding: $default_spacing; -} - -.pb { - padding-bottom: $default_spacing; -} - -.pt { - padding-top: $default_spacing; -} - -.pr { - padding-right: $default_spacing; -} - -.pl { - padding-left: $default_spacing; -} - -.full-width { - width: 100%; -} - -.float-right { - float: right; -} - -.float-left { - float: left; -} - -.main-app-content { - /* top | horizontal | bottom */ - padding: 0px $default_spacing * 3 $default_spacing * 3; - - @media (min-width: 480px) and (max-width: 639px) { - padding: $default_spacing $default_spacing * 2 !important; - } - - @media (max-width: 480px) { - padding: $default_spacing $default_spacing !important; - } -} - -// Color styles -.bg-primary { - background-color: $primary; -} - -.text-success { - color: $success !important; -} - -.text-danger { - color: $danger !important; -} - -.text-warning { - color: $warning !important; -} - -// Random general styles -.cursor-pointer { - cursor: pointer; -} - -// Toasts -.Toastify__toast-container { - opacity: 1; - transform: none; -} - -.Toastify__toast { - padding: 16px; - border-radius: $radius; - color: $text_default; - font-family: inherit; - background-color: $bg_light; -} - -.Toastify__toast-body { - @extend .l3; - color: $success; - p { - @extend .text-default; - color: $text_default; - } -} - -.Toastify__progress-bar--default { - background: $success; -} - -.Toastify__toast--error { - h1 { - color: $danger; - } - .Toastify__progress-bar { - background: $danger; - } - .error-details { - font-style: italic; - } -} - -// Table styles -.table-bordered td { - border: 1px solid $border; -} - -// Card styles -.card-elevated { - @extend .mixin-elevated; -} - -// Form & input styles -.input-set { - padding-bottom: $default_spacing; - 
color: $text_default; - - label { - font-weight: bold; - @extend .text-default; - } - - .caption { - color: $text_muted; - @extend .text-small; - } - - &.errored { - .caption { - color: $danger; - } - .ant-input-password, - input[type='text'] { - border-color: $danger !important; - } - } -} - -// Button styles -.btn-close { - color: $text_muted; -} - -.ant-btn-sm { - font-size: 12px !important; -} - -.ant-btn-md { - // Size between `small` & `default` - font-size: 13px !important; - height: 28px !important; - padding: 0px 10px !important; -} - -.info-indicator { - color: $primary !important; - cursor: pointer; - margin-left: 5px; -} - -// Badges styles -.badge { - border-radius: 50%; - width: 22px; - height: 22px; - background-color: $border_dark; - display: flex; - align-items: center; - justify-content: center; - color: white; - font-size: 12px; -} - -// Overlays styles -#bottom-notice { - z-index: 1000000000; - display: flex; - flex-direction: row; - position: fixed; - width: 100%; - bottom: 0; - left: 0; - background: #000; - color: #fff; - font-size: 0.75rem; - line-height: 1.5rem; - code { - color: inherit; - font-size: inherit; - } - &.warning div { - height: auto; - background: $danger; - } - &.tricolor { - div:nth-child(1) { - background: $brand_blue; - } - div:nth-child(2) { - background: $brand_red; - } - div:nth-child(3) { - background: $brand_yellow; - } - } - div { - flex-basis: 0; - flex-grow: 1; - height: 1.5rem; - text-align: center; - } - span { - display: none; - } - button { - border: none; - background: transparent; - color: inherit; - width: 1.5rem; - height: 1.5rem; - padding: 0; - font-size: 1rem; - font-weight: bold; - cursor: pointer; - } - @media screen and (min-width: 750px) { - font-size: 1rem; - line-height: 2rem; - div { - height: 2rem; - } - span { - display: inline; - } - button { - width: 2rem; - height: 2rem; - font-size: 1.25rem; - } - } -} diff --git a/frontend/src/globals.d.ts b/frontend/src/globals.d.ts index c049ce16b57f0..b043954b92ee6 100644 --- a/frontend/src/globals.d.ts +++ b/frontend/src/globals.d.ts @@ -4,6 +4,11 @@ declare global { interface Window { JS_POSTHOG_API_KEY?: str JS_POSTHOG_HOST?: str + JS_POSTHOG_SELF_CAPTURE?: boolean + JS_CAPTURE_INTERNAL_METRICS?: boolean posthog?: posthog + ESBUILD_LOAD_SCRIPT: (name) => void + ESBUILD_LOAD_CHUNKS: (name) => void + ESBUILD_LOADED_CHUNKS: Set } } diff --git a/frontend/src/index.html b/frontend/src/index.html index aaaa61e7bb5a2..3f66b59629f87 100644 --- a/frontend/src/index.html +++ b/frontend/src/index.html @@ -1,12 +1,16 @@ - + {% include "head.html" %} {% include "overlays.html" %}
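+ <!-- Assumption: live_server_inject.html is a no-op outside local development, where it hooks the page up to the esbuild watch server started by frontend/build.mjs -->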
+ {% include "live_server_inject.html" %} diff --git a/frontend/src/index.tsx b/frontend/src/index.tsx index 98e0b30705e49..5f851da879acb 100644 --- a/frontend/src/index.tsx +++ b/frontend/src/index.tsx @@ -1,6 +1,5 @@ -import '~/global.scss' /* Contains PostHog's main styling configurations */ -import '~/antd.less' /* Imports Ant Design's components */ -import './style.scss' /* DEPRECATED */ +import '~/styles' + import React from 'react' import ReactDOM from 'react-dom' import { Provider } from 'react-redux' @@ -14,6 +13,16 @@ import { loadPostHogJS } from './loadPostHogJS' loadPostHogJS() initKea() +// Expose `window.getReduxState()` to make snapshots to storybook easy +if (typeof window !== 'undefined') { + // Disabled in production to prevent leaking secret data, personal API keys, etc + if (process.env.NODE_ENV === 'development') { + ;(window as any).getReduxState = () => getContext().store.getState() + } else { + ;(window as any).getReduxState = () => 'Disabled outside development!' + } +} + ReactDOM.render( diff --git a/frontend/src/initKea.ts b/frontend/src/initKea.ts new file mode 100644 index 0000000000000..799381d89b6c0 --- /dev/null +++ b/frontend/src/initKea.ts @@ -0,0 +1,74 @@ +import { KeaPlugin, resetContext } from 'kea' +import { localStoragePlugin } from 'kea-localstorage' +import { routerPlugin } from 'kea-router' +import { loadersPlugin } from 'kea-loaders' +import { windowValuesPlugin } from 'kea-window-values' +import { errorToast, identifierToHuman } from 'lib/utils' +import { waitForPlugin } from 'kea-waitfor' +import dayjs from 'dayjs' +import LocalizedFormat from 'dayjs/plugin/localizedFormat' + +/* +Actions for which we don't want to show error alerts, +mostly to avoid user confusion. +*/ +const ERROR_FILTER_WHITELIST = [ + 'loadPreflight', // Gracefully handled if it fails + 'loadUser', // App won't load (unless loading from shared dashboards) + 'loadFunnels', // Special error handling on insights + 'loadResults', // Special error handling on insights + 'authenticate', // Special error handling on login + 'signup', // Special error handling on login + 'loadLatestVersion', + 'loadBilling', // Gracefully handled if it fails +] + +interface InitKeaProps { + state?: Record + routerHistory?: any + routerLocation?: any + beforePlugins?: KeaPlugin[] +} + +export function initKea({ state, routerHistory, routerLocation, beforePlugins }: InitKeaProps = {}): void { + // necessary for any localised date formatting to work + // doesn't matter if it is called multiple times but must be called once + dayjs.extend(LocalizedFormat) + + resetContext({ + plugins: [ + ...(beforePlugins || []), + localStoragePlugin, + windowValuesPlugin({ window: window }), + routerPlugin({ history: routerHistory, location: routerLocation }), + loadersPlugin({ + onFailure({ error, reducerKey, actionKey }: { error: any; reducerKey: string; actionKey: string }) { + // Toast if it's a fetch error or a specific API update error + if ( + !ERROR_FILTER_WHITELIST.includes(actionKey) && + (error?.message === 'Failed to fetch' || // Likely CORS headers errors (i.e. request failing without reaching Django) + (error?.status !== undefined && ![200, 201, 204].includes(error.status))) + ) { + errorToast( + `Error on ${identifierToHuman(reducerKey)}`, + `Attempting to ${identifierToHuman(actionKey).toLowerCase()} returned an error:`, + error.status !== 0 + ? 
error.detail + : "Check your internet connection and make sure you don't have an extension blocking our requests.", + error.code + ) + } + console.error(error) + ;(window as any).Sentry?.captureException(error) + }, + }), + waitForPlugin, + ], + defaults: state, + createStore: state + ? { + preloadedState: state, + } + : true, + }) +} diff --git a/frontend/src/initKea.tsx b/frontend/src/initKea.tsx deleted file mode 100644 index 20b597430abab..0000000000000 --- a/frontend/src/initKea.tsx +++ /dev/null @@ -1,32 +0,0 @@ -import { resetContext } from 'kea' -import localStoragePlugin from 'kea-localstorage' -import { routerPlugin } from 'kea-router' -import { loadersPlugin } from 'kea-loaders' -import { windowValuesPlugin } from 'kea-window-values' -import { toast } from 'react-toastify' -import React from 'react' -import { identifierToHuman } from 'lib/utils' - -export function initKea(): void { - resetContext({ - plugins: [ - localStoragePlugin, - windowValuesPlugin({ window: window }), - routerPlugin, - loadersPlugin({ - onFailure({ error, reducerKey, actionKey }) { - toast.error( -
<div> - <h1>Error on {identifierToHuman(reducerKey)}.</h1> - <p> - Attempting to {identifierToHuman(actionKey, false)} returned an error:{' '} - <span className="error-details">{error.detail}</span> - </p> - </div>
- ) - window['Sentry'] ? window['Sentry'].captureException(error) : console.error(error) - }, - }), - ], - }) -} diff --git a/frontend/src/layout.ejs b/frontend/src/layout.ejs index 772bfe17cb7af..e206d3139e569 100644 --- a/frontend/src/layout.ejs +++ b/frontend/src/layout.ejs @@ -1,116 +1,121 @@ <%/* DJANGO login/signup form layout! */%> - - - - - PostHog - {% include "head.html" %} - <%= htmlWebpackPlugin.tags.headTags %><%/* This adds the main.css file! */%> - - - {% block head %} - {% endblock %} - - + .field input { + font-size: 1.5em; + border: 0; + border-bottom: 1px solid #ccc; + font-family: inherit; + -webkit-appearance: none; + border-radius: 0; + padding: 0; + cursor: text; + } - {% include "overlays.html" %} - {% block content %} - {% endblock %} + .field input:focus { + outline: 0; + border-bottom: 1px solid #666; + } - + input:placeholder-shown + label { + cursor: text; + max-width: 66.66%; + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; + transform-origin: left bottom; + transform: translate(0, 2.125rem) scale(1.5); + } + ::-webkit-input-placeholder { + opacity: 0; + transition: inherit; + } + input:focus::-webkit-input-placeholder { + opacity: 1; + } + input:not(:placeholder-shown) + label, + input:focus + label { + transform: translate(0, 0) scale(1); + cursor: pointer; + color: var(--primary); + } + + {% block head %} {% endblock %} + + + {% include "overlays.html" %} {% block content %} {% endblock %} + diff --git a/frontend/src/layout.html b/frontend/src/layout.html new file mode 100644 index 0000000000000..55be6b0757a87 --- /dev/null +++ b/frontend/src/layout.html @@ -0,0 +1,119 @@ + + + + + PostHog + {% include "head.html" %} + + + + + + {% block head %} {% endblock %} + + + {% include "overlays.html" %} {% block content %} {% endblock %} + + diff --git a/frontend/src/layout/ChangelogModal.tsx b/frontend/src/layout/ChangelogModal.tsx index 3f90b73fb4f68..080a945458490 100644 --- a/frontend/src/layout/ChangelogModal.tsx +++ b/frontend/src/layout/ChangelogModal.tsx @@ -1,36 +1,42 @@ import React from 'react' import { Button, Modal } from 'antd' import { useValues } from 'kea' -import { useLatestVersion } from 'lib/hooks/useLatestVersion' -import { userLogic } from 'scenes/userLogic' +import { navigationLogic } from './navigation/navigationLogic' +import { preflightLogic } from 'scenes/PreflightCheck/logic' -export function ChangelogModal({ onDismiss }: { onDismiss: () => void }): JSX.Element { - const { user } = useValues(userLogic) - const latestVersion = useLatestVersion(user?.posthog_version) +export interface ChangelogModalProps { + onDismiss: () => void + visible?: boolean +} + +export function ChangelogModal({ onDismiss, visible }: ChangelogModalProps): JSX.Element | null { + const { preflight } = useValues(preflightLogic) + const { latestVersion } = useValues(navigationLogic) + + if (preflight?.cloud) { + // The changelog is not available on cloud + return null + } return ( Close} style={{ top: 20, minWidth: '70%', fontSize: 16 }} > - {!window.location.href.includes('app.posthog.com') ? ( - - You're on version {user?.posthog_version} of PostHog.{' '} - {latestVersion && - (latestVersion === user?.posthog_version ? ( - 'This is the newest version.' - ) : ( - <> - The newest version is {latestVersion}. - - ))} - - ) : ( - You're on the newest version of PostHog. - )} + + You're on version {preflight?.posthog_version} of PostHog.{' '} + {latestVersion && + (latestVersion === preflight?.posthog_version ? ( + 'This is the newest version.' 
+ ) : ( + + The newest version is {latestVersion}. + + ))} +