diff --git a/README.pdf b/README.pdf
index 286883f..8af8054 100644
Binary files a/README.pdf and b/README.pdf differ
diff --git a/RELEASE_NOTES.html b/RELEASE_NOTES.html
index 409a3f9..069de39 100644
--- a/RELEASE_NOTES.html
+++ b/RELEASE_NOTES.html
@@ -228,6 +228,8 @@
1823 def get_table_columns_from_format(self, database: str = None) -> list: -1824 """ -1825 The function `get_table_columns_from_format` returns a list of table columns based on the -1826 specified database format. -1827 -1828 :param database: The `database` parameter is a string that represents the name of the database. -1829 It is an optional parameter, which means it has a default value of `None`. If no value is -1830 provided for the `database` parameter, the `get_database()` method is called to retrieve the -1831 current database name -1832 :type database: str -1833 :return: a list of table columns. -1834 """ -1835 -1836 table_columns = None +@@ -9412,81 +9446,81 @@1833 def get_table_columns_from_format(self, database: str = None) -> list: +1834 """ +1835 The function `get_table_columns_from_format` returns a list of table columns based on the +1836 specified database format. 1837 -1838 if not database: -1839 database = self.get_database() -1840 -1841 database_format = self.get_format(database) -1842 -1843 needed_columns = DATABASE_TYPE_NEEDED_COLUMNS.get(database_format, None) -1844 if needed_columns: -1845 table_columns = list(needed_columns.keys()) -1846 else: -1847 table_columns = [] -1848 -1849 return table_columns +1838 :param database: The `database` parameter is a string that represents the name of the database. +1839 It is an optional parameter, which means it has a default value of `None`. If no value is +1840 provided for the `database` parameter, the `get_database()` method is called to retrieve the +1841 current database name +1842 :type database: str +1843 :return: a list of table columns. +1844 """ +1845 +1846 table_columns = None +1847 +1848 if not database: +1849 database = self.get_database() +1850 +1851 database_format = self.get_format(database) +1852 +1853 needed_columns = DATABASE_TYPE_NEEDED_COLUMNS.get(database_format, None) +1854 if needed_columns: +1855 table_columns = list(needed_columns.keys()) +1856 else: +1857 table_columns = [] +1858 +1859 return table_columnsReturns
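A minimal usage sketch of `get_table_columns_from_format` as documented above. The import path `howard.objects.database.Database`, the constructor call, and the file path are assumptions for illustration only:

    # Hypothetical usage sketch (import path and file path are assumptions)
    from howard.objects.database import Database

    db = Database(database="example.vcf.gz")

    # Columns implied by the detected database format
    # (e.g. the mandatory VCF columns when the format is VCF)
    columns = db.get_table_columns_from_format()
    print(columns)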
1851 def get_table_columns_from_file( -1852 self, -1853 database: str = None, -1854 header_file: str = None, -1855 header_file_find: bool = True, -1856 ) -> list: -1857 """ -1858 The function `get_table_columns_from_file` retrieves the column names from a database or header -1859 file. -1860 -1861 :param database: The `database` parameter is a string that represents the name or path of the -1862 database file. If this parameter is not provided, the `get_database()` method is called to -1863 retrieve the database name or path -1864 :type database: str -1865 :param header_file: The `header_file` parameter is a string that represents the file path or -1866 name of the header file. This file contains the header information for a table, which typically -1867 includes the names of the columns in the table -1868 :type header_file: str -1869 :param header_file_find: Allow header file find if not provided -1870 :type header_file_find: bool -1871 :return: a list of table columns. -1872 """ -1873 -1874 table_columns = None -1875 -1876 if not database: -1877 database = self.get_database() -1878 -1879 if not header_file and header_file_find: -1880 header_file = self.get_header_file() -1881 -1882 if not header_file and header_file_find: -1883 header_file = self.find_header_file(database) -1884 -1885 database_format = self.get_format(database=database) -1886 delimiter = SEP_TYPE.get(database_format, "\t") -1887 -1888 # Try from database file -1889 try: -1890 table_header = self.read_header_file(database) -1891 except ValueError: -1892 table_header = None -1893 -1894 if table_header: -1895 try: -1896 table_columns = ( -1897 table_header[self.get_header_length(header_file=table_header)] -1898 .strip() -1899 .split(delimiter) -1900 ) -1901 except IndexError: -1902 table_columns = None -1903 else: -1904 table_columns = None -1905 -1906 if not table_columns: -1907 # Try from header file -1908 try: -1909 table_header = self.read_header_file(header_file) -1910 except ValueError: -1911 table_header = None -1912 -1913 if table_header: -1914 try: -1915 table_columns = ( -1916 table_header[self.get_header_length(header_file=table_header)] -1917 .strip() -1918 .split(delimiter) -1919 ) -1920 except IndexError: -1921 table_columns = None -1922 else: -1923 table_columns = None -1924 -1925 return table_columns +@@ -9525,27 +9559,27 @@1861 def get_table_columns_from_file( +1862 self, +1863 database: str = None, +1864 header_file: str = None, +1865 header_file_find: bool = True, +1866 ) -> list: +1867 """ +1868 The function `get_table_columns_from_file` retrieves the column names from a database or header +1869 file. +1870 +1871 :param database: The `database` parameter is a string that represents the name or path of the +1872 database file. If this parameter is not provided, the `get_database()` method is called to +1873 retrieve the database name or path +1874 :type database: str +1875 :param header_file: The `header_file` parameter is a string that represents the file path or +1876 name of the header file. This file contains the header information for a table, which typically +1877 includes the names of the columns in the table +1878 :type header_file: str +1879 :param header_file_find: Allow header file find if not provided +1880 :type header_file_find: bool +1881 :return: a list of table columns. 
+1882 """ +1883 +1884 table_columns = None +1885 +1886 if not database: +1887 database = self.get_database() +1888 +1889 if not header_file and header_file_find: +1890 header_file = self.get_header_file() +1891 +1892 if not header_file and header_file_find: +1893 header_file = self.find_header_file(database) +1894 +1895 database_format = self.get_format(database=database) +1896 delimiter = SEP_TYPE.get(database_format, "\t") +1897 +1898 # Try from database file +1899 try: +1900 table_header = self.read_header_file(database) +1901 except ValueError: +1902 table_header = None +1903 +1904 if table_header: +1905 try: +1906 table_columns = ( +1907 table_header[self.get_header_length(header_file=table_header)] +1908 .strip() +1909 .split(delimiter) +1910 ) +1911 except IndexError: +1912 table_columns = None +1913 else: +1914 table_columns = None +1915 +1916 if not table_columns: +1917 # Try from header file +1918 try: +1919 table_header = self.read_header_file(header_file) +1920 except ValueError: +1921 table_header = None +1922 +1923 if table_header: +1924 try: +1925 table_columns = ( +1926 table_header[self.get_header_length(header_file=table_header)] +1927 .strip() +1928 .split(delimiter) +1929 ) +1930 except IndexError: +1931 table_columns = None +1932 else: +1933 table_columns = None +1934 +1935 return table_columnsReturns
1927 def get_annotations(self, database: str = None) -> object: -1928 """ -1929 This function returns the annotations of a database or the default database if none is -1930 specified. -1931 -1932 :param database: The parameter `database` is a string that represents the name of the database -1933 to retrieve annotations from. If no database name is provided, the method will use the default -1934 database name obtained from the `get_database()` method -1935 :type database: str -1936 :return: The function `get_annotations` returns the `infos` attribute of the header of a -1937 database. If the `database` parameter is not provided, it gets the current database using the -1938 `get_database` method. If there is no header, it returns `None`. -1939 """ -1940 -1941 if not database: -1942 database = self.get_database() -1943 -1944 if self.get_header(database=database): -1945 return self.get_header(database=database).infos -1946 else: -1947 return None +@@ -9582,54 +9616,54 @@1937 def get_annotations(self, database: str = None) -> object: +1938 """ +1939 This function returns the annotations of a database or the default database if none is +1940 specified. +1941 +1942 :param database: The parameter `database` is a string that represents the name of the database +1943 to retrieve annotations from. If no database name is provided, the method will use the default +1944 database name obtained from the `get_database()` method +1945 :type database: str +1946 :return: The function `get_annotations` returns the `infos` attribute of the header of a +1947 database. If the `database` parameter is not provided, it gets the current database using the +1948 `get_database` method. If there is no header, it returns `None`. +1949 """ +1950 +1951 if not database: +1952 database = self.get_database() +1953 +1954 if self.get_header(database=database): +1955 return self.get_header(database=database).infos +1956 else: +1957 return NoneReturns
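A minimal usage sketch of `get_annotations` as documented above, under the same assumptions:

    # Hypothetical usage sketch (import path and file path are assumptions)
    from howard.objects.database import Database

    db = Database(database="example.vcf.gz")

    # `infos` attribute of the parsed header (per the docstring above), or None
    annotations = db.get_annotations()
    if annotations is not None:
        print(list(annotations))  # assumed dict-like: annotation/INFO field IDs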
1949 def get_extra_columns( -1950 self, database: str = None, database_type: str = None, sql_query: str = None -1951 ) -> list: -1952 """ -1953 This Python function returns a list of extra columns in a database table that are not needed -1954 based on the database type and existing columns. -1955 -1956 :param database: A string representing the name of the database to retrieve columns from. If -1957 None is provided, the default database will be used -1958 :type database: str -1959 :param database_type: The `database_type` parameter in the `get_extra_columns` function -1960 represents the type of the database for which you want to retrieve the list of extra columns. It -1961 is used to determine which columns are needed based on the database type and the existing -1962 columns in the specified database table -1963 :type database_type: str -1964 :param sql_query: The `sql_query` parameter in the `get_extra_columns` function is used to pass -1965 an SQL query that can be used to retrieve specific columns from the database. This query can be -1966 customized to filter columns based on certain conditions or criteria before analyzing them to -1967 determine the extra columns that are not needed -1968 :type sql_query: str -1969 :return: A list of extra columns in a database table that are not needed based on the database -1970 type and existing columns. -1971 """ -1972 -1973 if not database: -1974 database = self.get_database() -1975 -1976 if not database: -1977 return [] -1978 -1979 existing_columns = self.get_columns( -1980 database=database, -1981 table=self.get_database_table(database), -1982 sql_query=sql_query, -1983 ) -1984 if not database_type: -1985 database_type = self.get_type(database=database, sql_query=sql_query) -1986 needed_columns = self.get_needed_columns( -1987 database_columns=existing_columns, database_type=database_type -1988 ) -1989 -1990 extra_columns = existing_columns.copy() -1991 -1992 for needed_col in needed_columns: -1993 if needed_columns.get(needed_col) in extra_columns: -1994 extra_columns.remove(needed_columns.get(needed_col)) -1995 -1996 return extra_columns +@@ -9672,40 +9706,40 @@1959 def get_extra_columns( +1960 self, database: str = None, database_type: str = None, sql_query: str = None +1961 ) -> list: +1962 """ +1963 This Python function returns a list of extra columns in a database table that are not needed +1964 based on the database type and existing columns. +1965 +1966 :param database: A string representing the name of the database to retrieve columns from. If +1967 None is provided, the default database will be used +1968 :type database: str +1969 :param database_type: The `database_type` parameter in the `get_extra_columns` function +1970 represents the type of the database for which you want to retrieve the list of extra columns. It +1971 is used to determine which columns are needed based on the database type and the existing +1972 columns in the specified database table +1973 :type database_type: str +1974 :param sql_query: The `sql_query` parameter in the `get_extra_columns` function is used to pass +1975 an SQL query that can be used to retrieve specific columns from the database. This query can be +1976 customized to filter columns based on certain conditions or criteria before analyzing them to +1977 determine the extra columns that are not needed +1978 :type sql_query: str +1979 :return: A list of extra columns in a database table that are not needed based on the database +1980 type and existing columns. 
+1981 """ +1982 +1983 if not database: +1984 database = self.get_database() +1985 +1986 if not database: +1987 return [] +1988 +1989 existing_columns = self.get_columns( +1990 database=database, +1991 table=self.get_database_table(database), +1992 sql_query=sql_query, +1993 ) +1994 if not database_type: +1995 database_type = self.get_type(database=database, sql_query=sql_query) +1996 needed_columns = self.get_needed_columns( +1997 database_columns=existing_columns, database_type=database_type +1998 ) +1999 +2000 extra_columns = existing_columns.copy() +2001 +2002 for needed_col in needed_columns: +2003 if needed_columns.get(needed_col) in extra_columns: +2004 extra_columns.remove(needed_columns.get(needed_col)) +2005 +2006 return extra_columnsReturns
1998 def is_vcf(self, database: str = None, sql_query: str = None) -> bool: -1999 """ -2000 The `is_vcf` function checks if a given database is of type "vcf" by examining its columns and -2001 their types. -2002 -2003 :param database: The `database` parameter in the `is_vcf` function is a string that represents -2004 the name of the database that the function will use to check if the file is a VCF (Variant Call -2005 Format) file. If the `database` parameter is not provided when calling the function, it will -2006 :type database: str -2007 :param sql_query: The `sql_query` parameter in the `is_vcf` function is used to pass an SQL -2008 query string that can be used to filter the columns retrieved from the database. This query can -2009 be used to narrow down the columns that are considered when checking if the database is of type -2010 "vcf" -2011 :type sql_query: str -2012 :return: The function `is_vcf` returns a boolean value indicating whether the database type is -2013 "vcf" or not. -2014 """ -2015 -2016 if not database: -2017 database = self.get_database() -2018 -2019 if not database: -2020 return False -2021 -2022 database_columns = self.get_columns( -2023 database=database, -2024 table=self.get_database_table(database), -2025 sql_query=sql_query, -2026 ) -2027 -2028 # Assume VCF is 8 needed columns, either only or with extra FORMAT column (assume other are samples) -2029 return self.get_type_from_columns( -2030 database_columns=database_columns, check_database_type="vcf" -2031 ) == "vcf" and ("FORMAT" in database_columns or len(database_columns) == 8) +@@ -9745,13 +9779,13 @@2008 def is_vcf(self, database: str = None, sql_query: str = None) -> bool: +2009 """ +2010 The `is_vcf` function checks if a given database is of type "vcf" by examining its columns and +2011 their types. +2012 +2013 :param database: The `database` parameter in the `is_vcf` function is a string that represents +2014 the name of the database that the function will use to check if the file is a VCF (Variant Call +2015 Format) file. If the `database` parameter is not provided when calling the function, it will +2016 :type database: str +2017 :param sql_query: The `sql_query` parameter in the `is_vcf` function is used to pass an SQL +2018 query string that can be used to filter the columns retrieved from the database. This query can +2019 be used to narrow down the columns that are considered when checking if the database is of type +2020 "vcf" +2021 :type sql_query: str +2022 :return: The function `is_vcf` returns a boolean value indicating whether the database type is +2023 "vcf" or not. +2024 """ +2025 +2026 if not database: +2027 database = self.get_database() +2028 +2029 if not database: +2030 return False +2031 +2032 database_columns = self.get_columns( +2033 database=database, +2034 table=self.get_database_table(database), +2035 sql_query=sql_query, +2036 ) +2037 +2038 # Assume VCF is 8 needed columns, either only or with extra FORMAT column (assume other are samples) +2039 return self.get_type_from_columns( +2040 database_columns=database_columns, check_database_type="vcf" +2041 ) == "vcf" and ("FORMAT" in database_columns or len(database_columns) == 8)Returns
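A minimal usage sketch of `is_vcf` as documented above, under the same assumptions:

    # Hypothetical usage sketch (import path and file path are assumptions)
    from howard.objects.database import Database

    db = Database(database="example.vcf.gz")

    # True only when the columns match the VCF layout: the 8 mandatory columns,
    # either alone or together with a FORMAT column and sample columns
    print(db.is_vcf())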
2033 def get_conn(self): -2034 """ -2035 The function returns the connection object. -2036 :return: The method is returning the value of the instance variable `self.conn`. -2037 """ -2038 -2039 return self.conn +@@ -9777,79 +9811,79 @@2043 def get_conn(self): +2044 """ +2045 The function returns the connection object. +2046 :return: The method is returning the value of the instance variable `self.conn`. +2047 """ +2048 +2049 return self.connReturns
2041 def is_genotype_column( -2042 self, -2043 column: str, -2044 database: str = None, -2045 downsampling: int = 1000, -2046 check_format: bool = True, -2047 ) -> bool: -2048 """ -2049 The `is_genotype_column` function in Python checks if a specified column in a database contains -2050 genotype data based on a regular expression pattern. -2051 -2052 :param column: The `column` parameter is a string that represents the name of a column in a -2053 database table. It is used to specify the column for which you want to check if it contains -2054 genotype information based on a regular expression pattern -2055 :type column: str -2056 :param database: The `database` parameter in the `is_genotype_column` method is used to specify -2057 the name of the database from which the data will be queried. If a database is provided, the -2058 method will query the specified database to check if the given column contains genotype -2059 information. If no database is provided, -2060 :type database: str -2061 :param downsampling: The `downsampling` parameter in the `is_genotype_column` method is an -2062 integer value that determines the number of rows to be sampled from the database table when -2063 checking for genotype information in the specified column. This parameter is used to limit the -2064 number of rows to be processed in order to improve performance, defaults to 1000 -2065 :type downsampling: int (optional) -2066 :param check_format: The `check_format` parameter in the `is_genotype_column` method is a -2067 boolean flag that determines whether the function should check the format of the data before -2068 proceeding with the genotype column analysis. If `check_format` is set to `True`, the function -2069 will verify if the specified column exists in, defaults to True -2070 :type check_format: bool (optional) -2071 :return: The `is_genotype_column` method returns a boolean value. If the specified column in a -2072 database table contains genotype information, it returns `True`; otherwise, it returns `False`. 
-2073 """ -2074 -2075 # Table variants -2076 table_variants_from = self.get_sql_database_link(database=database) -2077 -2078 # Check if format column is present -2079 if check_format: -2080 query = f""" -2081 SELECT * FROM {table_variants_from} -2082 LIMIT 0 -2083 """ -2084 df = self.query(query=query) -2085 if "FORMAT" not in df.columns or column not in df.columns: -2086 return False -2087 query_format = f""" -2088 AND (len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{column}" AS VARCHAR), ':')) OR regexp_matches(CAST("{column}" AS VARCHAR), '^[.]([/|][.])*$')) -2089 """ -2090 else: -2091 query_format = "" -2092 -2093 # Query number of samples -2094 query_downsampling = f""" -2095 SELECT "{column}", FORMAT -2096 FROM {table_variants_from} -2097 LIMIT {downsampling} -2098 """ -2099 df_downsampling = self.query(query=query_downsampling) -2100 -2101 # Query to check genotype -2102 query_genotype = f""" -2103 SELECT * -2104 FROM df_downsampling -2105 WHERE ( -2106 regexp_matches(CAST("{column}" AS VARCHAR), '^[0-9.]([/|][0-9.])*') -2107 {query_format} -2108 ) -2109 """ -2110 df_genotype = self.query(query=query_genotype) -2111 -2112 # return -2113 return len(df_genotype) == len(df_downsampling) +@@ -9897,807 +9931,809 @@2051 def is_genotype_column( +2052 self, +2053 column: str, +2054 database: str = None, +2055 downsampling: int = 1000, +2056 check_format: bool = True, +2057 ) -> bool: +2058 """ +2059 The `is_genotype_column` function in Python checks if a specified column in a database contains +2060 genotype data based on a regular expression pattern. +2061 +2062 :param column: The `column` parameter is a string that represents the name of a column in a +2063 database table. It is used to specify the column for which you want to check if it contains +2064 genotype information based on a regular expression pattern +2065 :type column: str +2066 :param database: The `database` parameter in the `is_genotype_column` method is used to specify +2067 the name of the database from which the data will be queried. If a database is provided, the +2068 method will query the specified database to check if the given column contains genotype +2069 information. If no database is provided, +2070 :type database: str +2071 :param downsampling: The `downsampling` parameter in the `is_genotype_column` method is an +2072 integer value that determines the number of rows to be sampled from the database table when +2073 checking for genotype information in the specified column. This parameter is used to limit the +2074 number of rows to be processed in order to improve performance, defaults to 1000 +2075 :type downsampling: int (optional) +2076 :param check_format: The `check_format` parameter in the `is_genotype_column` method is a +2077 boolean flag that determines whether the function should check the format of the data before +2078 proceeding with the genotype column analysis. If `check_format` is set to `True`, the function +2079 will verify if the specified column exists in, defaults to True +2080 :type check_format: bool (optional) +2081 :return: The `is_genotype_column` method returns a boolean value. If the specified column in a +2082 database table contains genotype information, it returns `True`; otherwise, it returns `False`. 
+2083 """ +2084 +2085 # Table variants +2086 table_variants_from = self.get_sql_database_link(database=database) +2087 +2088 # Check if format column is present +2089 if check_format: +2090 query = f""" +2091 SELECT * FROM {table_variants_from} +2092 LIMIT 0 +2093 """ +2094 df = self.query(query=query) +2095 if "FORMAT" not in df.columns or column not in df.columns: +2096 return False +2097 query_format = f""" +2098 AND (len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{column}" AS VARCHAR), ':')) OR regexp_matches(CAST("{column}" AS VARCHAR), '^[.]([/|][.])*$')) +2099 """ +2100 else: +2101 query_format = "" +2102 +2103 # Query number of samples +2104 query_downsampling = f""" +2105 SELECT "{column}", FORMAT +2106 FROM {table_variants_from} +2107 LIMIT {downsampling} +2108 """ +2109 df_downsampling = self.query(query=query_downsampling) +2110 +2111 # Query to check genotype +2112 query_genotype = f""" +2113 SELECT * +2114 FROM df_downsampling +2115 WHERE ( +2116 regexp_matches(CAST("{column}" AS VARCHAR), '^[0-9.]([/|][0-9.])*') +2117 {query_format} +2118 ) +2119 """ +2120 df_genotype = self.query(query=query_genotype) +2121 +2122 # return +2123 return len(df_genotype) == len(df_downsampling)Returns
2115 def export( -2116 self, -2117 output_database: str, -2118 output_header: str = None, -2119 header_in_output: bool = True, -2120 database: str = None, -2121 table: str = "variants", -2122 parquet_partitions: list = None, -2123 threads: int = 1, -2124 sort: bool = False, -2125 index: bool = False, -2126 existing_columns_header: list = [], -2127 order_by: str = "", -2128 query: str = None, -2129 compression_type: str = None, -2130 chunk_size: int = 1000000, -2131 export_mode: str = "pyarrow", -2132 compresslevel: int = 6, -2133 export_header: bool = True, -2134 sample_list: list = None, -2135 ) -> bool: -2136 """ -2137 The `export` function exports data from a database to a specified output format, compresses it -2138 if necessary, and returns a boolean value indicating whether the export was successful or not. -2139 -2140 :param output_database: The `output_database` parameter is a string that represents the path and -2141 filename of the output file to be exported. It specifies where the exported data will be saved -2142 :type output_database: str -2143 :param output_header: The `output_header` parameter is an optional string that represents the -2144 header of the output file. If provided, it specifies the header that will be included in the -2145 output file. If not provided, the header will be automatically detected based on the output file -2146 format -2147 :type output_header: str -2148 :param header_in_output: The `header_in_output` parameter is a boolean value that determines -2149 whether the header should be included in the output file. If set to `True`, the header will be -2150 included in the output file. If set to `False`, the header will not be included in the output -2151 file. By default,, defaults to True -2152 :type header_in_output: bool (optional) -2153 :param database: The `database` parameter is the name of the database from which you want to -2154 export data. If this parameter is not provided, the function will use the `get_database()` -2155 method to retrieve the current database -2156 :type database: str -2157 :param table: The `table` parameter specifies the name of the table in the database from which -2158 the data will be exported. By default, if not specified, it is set to "variants", defaults to -2159 variants -2160 :type table: str (optional) -2161 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the -2162 partition columns for the Parquet output format. Each element in the list represents a partition -2163 column. The partitions are used to organize the data in the Parquet file based on the values of -2164 the specified columns -2165 :type parquet_partitions: list -2166 :param threads: The `threads` parameter in the `export` function is an optional integer that -2167 specifies the number of threads to use for exporting the data. It determines the level of -2168 parallelism during the export process. By default, it is set to 1, defaults to 1 -2169 :type threads: int (optional) -2170 :param sort: The `sort` parameter in the `export` function is a boolean value that specifies -2171 whether the output file should be sorted based on the genomic coordinates of the variants. If -2172 `sort` is set to `True`, the output file will be sorted. If `sort` is set to `False`,, defaults -2173 to False -2174 :type sort: bool (optional) -2175 :param index: The `index` parameter is a boolean value that specifies whether to index the -2176 output file. If `index` is set to `True`, the output file will be indexed. 
If `index` is set to -2177 `False` or not provided, the output file will not be indexed. By default,, defaults to False -2178 :type index: bool (optional) -2179 :param existing_columns_header: The `existing_columns_header` parameter is a list that -2180 represents the existing columns in the header of the output file. It is used to determine the -2181 columns that should be included in the output file. If this parameter is not provided, the -2182 function will automatically detect the header columns based on the output file format -2183 :type existing_columns_header: list -2184 :param order_by: The `order_by` parameter in the `export` function is a string that specifies -2185 the columns by which the output file should be ordered. You can specify multiple columns -2186 separated by commas. Each column can be followed by the keyword "ASC" (ascending) or "DESC" -2187 (descending) to specify -2188 :type order_by: str -2189 :param query: The `query` parameter in the `export` function represents a SQL query that -2190 specifies the data to be exported from the database. If provided, the function will export the -2191 result of this query. If the `query` parameter is not provided, the function will generate a -2192 query to export the data from -2193 :type query: str -2194 :param compression_type: The `compression_type` parameter in the `export` function specifies the -2195 type of compression to be applied to the output file. By default, the compression type is set to -2196 "bgzip". This parameter allows you to choose the compression algorithm for the output file, such -2197 as "gzip", "bgzip -2198 :type compression_type: str -2199 :param chunk_size: The `chunk_size` parameter in the `export` function specifies the size of -2200 each chunk or batch of data that will be processed during the export operation. It determines -2201 how many records or lines of data will be included in each chunk that is processed at a time, -2202 defaults to 1000000 -2203 :type chunk_size: int (optional) -2204 :param export_mode: The `export_mode` parameter in the `export` function specifies the mode of -2205 export, which can be either "pyarrow" or "duckdb", defaults to pyarrow -2206 :type export_mode: str (optional) -2207 :param compresslevel: The `compresslevel` parameter in the `export` function represents the -2208 level of compression for gzip. By default, it is set to 6. This parameter allows you to specify -2209 the compression level when using gzip compression for the output file. The compression level can -2210 range from 0 (no compression), defaults to 6 -2211 :type compresslevel: int (optional) -2212 :param export_header: The `export_header` parameter is a boolean flag that determines whether -2213 the header of a VCF file should be exported to a separate file or not. If `export_header` is -2214 True, the header will be exported to a file. If `export_header` is False, the header will not -2215 be, defaults to True -2216 :type export_header: bool (optional) -2217 :param sample_list: The `sample_list` parameter in the `export` function is a list that -2218 specifies the samples to be included in the exported data. If provided, the samples listed in -2219 this parameter will be included in the output file. If not provided, the function will determine -2220 the samples to include based on the data -2221 :type sample_list: list -2222 :return: The `export` function returns a boolean value indicating whether the export was -2223 successful or not. 
-2224 """ -2225 -2226 # Full path -2227 output_database = full_path(output_database) -2228 output_header = full_path(output_header) -2229 database = full_path(database) -2230 -2231 # Database -2232 if not database: -2233 database = self.get_database() -2234 if not database: -2235 return False -2236 -2237 # Chunk size -2238 if not chunk_size: -2239 chunk_size = 1000000 -2240 else: -2241 chunk_size = int(chunk_size) -2242 -2243 # Export mode -2244 # Either "pyarrow" (default) or "duckdb" -2245 if not export_mode: -2246 export_mode = "pyarrow" -2247 -2248 # Compression level -2249 if not compresslevel: -2250 compresslevel = 6 -2251 -2252 # Remove output if exists -2253 remove_if_exists(output_database) -2254 -2255 # Tmp -2256 tmp_folder = os.path.dirname(output_database) -2257 if not tmp_folder: -2258 tmp_folder = "." -2259 -2260 with TemporaryDirectory( -2261 dir=tmp_folder, prefix="howard_database_export_" -2262 ) as tmp_dir: -2263 -2264 # tmp files -2265 tmp_files = [] -2266 -2267 # query_set -2268 query_set = "" +diff --git a/docs/pdoc/howard/objects/variants.html b/docs/pdoc/howard/objects/variants.html index 83d5367..713f453 100644 --- a/docs/pdoc/howard/objects/variants.html +++ b/docs/pdoc/howard/objects/variants.html @@ -6629,7 +6629,7 @@2125 def export( +2126 self, +2127 output_database: str, +2128 output_header: str = None, +2129 header_in_output: bool = True, +2130 database: str = None, +2131 table: str = "variants", +2132 parquet_partitions: list = None, +2133 threads: int = 1, +2134 sort: bool = False, +2135 index: bool = False, +2136 existing_columns_header: list = [], +2137 order_by: str = "", +2138 query: str = None, +2139 compression_type: str = None, +2140 chunk_size: int = 1000000, +2141 export_mode: str = "pyarrow", +2142 compresslevel: int = 6, +2143 export_header: bool = True, +2144 sample_list: list = None, +2145 ) -> bool: +2146 """ +2147 The `export` function exports data from a database to a specified output format, compresses it +2148 if necessary, and returns a boolean value indicating whether the export was successful or not. +2149 +2150 :param output_database: The `output_database` parameter is a string that represents the path and +2151 filename of the output file to be exported. It specifies where the exported data will be saved +2152 :type output_database: str +2153 :param output_header: The `output_header` parameter is an optional string that represents the +2154 header of the output file. If provided, it specifies the header that will be included in the +2155 output file. If not provided, the header will be automatically detected based on the output file +2156 format +2157 :type output_header: str +2158 :param header_in_output: The `header_in_output` parameter is a boolean value that determines +2159 whether the header should be included in the output file. If set to `True`, the header will be +2160 included in the output file. If set to `False`, the header will not be included in the output +2161 file. By default,, defaults to True +2162 :type header_in_output: bool (optional) +2163 :param database: The `database` parameter is the name of the database from which you want to +2164 export data. If this parameter is not provided, the function will use the `get_database()` +2165 method to retrieve the current database +2166 :type database: str +2167 :param table: The `table` parameter specifies the name of the table in the database from which +2168 the data will be exported. 
By default, if not specified, it is set to "variants", defaults to +2169 variants +2170 :type table: str (optional) +2171 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the +2172 partition columns for the Parquet output format. Each element in the list represents a partition +2173 column. The partitions are used to organize the data in the Parquet file based on the values of +2174 the specified columns +2175 :type parquet_partitions: list +2176 :param threads: The `threads` parameter in the `export` function is an optional integer that +2177 specifies the number of threads to use for exporting the data. It determines the level of +2178 parallelism during the export process. By default, it is set to 1, defaults to 1 +2179 :type threads: int (optional) +2180 :param sort: The `sort` parameter in the `export` function is a boolean value that specifies +2181 whether the output file should be sorted based on the genomic coordinates of the variants. If +2182 `sort` is set to `True`, the output file will be sorted. If `sort` is set to `False`,, defaults +2183 to False +2184 :type sort: bool (optional) +2185 :param index: The `index` parameter is a boolean value that specifies whether to index the +2186 output file. If `index` is set to `True`, the output file will be indexed. If `index` is set to +2187 `False` or not provided, the output file will not be indexed. By default,, defaults to False +2188 :type index: bool (optional) +2189 :param existing_columns_header: The `existing_columns_header` parameter is a list that +2190 represents the existing columns in the header of the output file. It is used to determine the +2191 columns that should be included in the output file. If this parameter is not provided, the +2192 function will automatically detect the header columns based on the output file format +2193 :type existing_columns_header: list +2194 :param order_by: The `order_by` parameter in the `export` function is a string that specifies +2195 the columns by which the output file should be ordered. You can specify multiple columns +2196 separated by commas. Each column can be followed by the keyword "ASC" (ascending) or "DESC" +2197 (descending) to specify +2198 :type order_by: str +2199 :param query: The `query` parameter in the `export` function represents a SQL query that +2200 specifies the data to be exported from the database. If provided, the function will export the +2201 result of this query. If the `query` parameter is not provided, the function will generate a +2202 query to export the data from +2203 :type query: str +2204 :param compression_type: The `compression_type` parameter in the `export` function specifies the +2205 type of compression to be applied to the output file. By default, the compression type is set to +2206 "bgzip". This parameter allows you to choose the compression algorithm for the output file, such +2207 as "gzip", "bgzip +2208 :type compression_type: str +2209 :param chunk_size: The `chunk_size` parameter in the `export` function specifies the size of +2210 each chunk or batch of data that will be processed during the export operation. 
It determines +2211 how many records or lines of data will be included in each chunk that is processed at a time, +2212 defaults to 1000000 +2213 :type chunk_size: int (optional) +2214 :param export_mode: The `export_mode` parameter in the `export` function specifies the mode of +2215 export, which can be either "pyarrow" or "duckdb", defaults to pyarrow +2216 :type export_mode: str (optional) +2217 :param compresslevel: The `compresslevel` parameter in the `export` function represents the +2218 level of compression for gzip. By default, it is set to 6. This parameter allows you to specify +2219 the compression level when using gzip compression for the output file. The compression level can +2220 range from 0 (no compression), defaults to 6 +2221 :type compresslevel: int (optional) +2222 :param export_header: The `export_header` parameter is a boolean flag that determines whether +2223 the header of a VCF file should be exported to a separate file or not. If `export_header` is +2224 True, the header will be exported to a file. If `export_header` is False, the header will not +2225 be, defaults to True +2226 :type export_header: bool (optional) +2227 :param sample_list: The `sample_list` parameter in the `export` function is a list that +2228 specifies the samples to be included in the exported data. If provided, the samples listed in +2229 this parameter will be included in the output file. If not provided, the function will determine +2230 the samples to include based on the data +2231 :type sample_list: list +2232 :return: The `export` function returns a boolean value indicating whether the export was +2233 successful or not. +2234 """ +2235 +2236 # Full path +2237 output_database = full_path(output_database) +2238 output_header = full_path(output_header) +2239 database = full_path(database) +2240 +2241 # Database +2242 if not database: +2243 database = self.get_database() +2244 if not database: +2245 return False +2246 +2247 # Chunk size +2248 if not chunk_size: +2249 chunk_size = 1000000 +2250 else: +2251 chunk_size = int(chunk_size) +2252 +2253 # Export mode +2254 # Either "pyarrow" (default) or "duckdb" +2255 if not export_mode: +2256 export_mode = "pyarrow" +2257 +2258 # Compression level +2259 if not compresslevel: +2260 compresslevel = 6 +2261 +2262 # Remove output if exists +2263 remove_if_exists(output_database) +2264 +2265 # Tmp +2266 tmp_folder = os.path.dirname(output_database) +2267 if not tmp_folder: +2268 tmp_folder = "." 
2269 -2270 # Header columns -2271 if not existing_columns_header and output_header: -2272 existing_columns_header = self.get_header_file_columns(output_header) +2270 with TemporaryDirectory( +2271 dir=tmp_folder, prefix="howard_database_export_" +2272 ) as tmp_dir: 2273 -2274 # Auto-detect output type and compression and delimiter -2275 output_type = get_file_format(output_database) -2276 compressed = self.is_compressed(database=output_database) -2277 delimiter = FILE_FORMAT_DELIMITERS.get(output_type, "\t") -2278 -2279 # database type -2280 if output_type in ["vcf"]: -2281 database_type = "vcf" -2282 elif output_type in ["bed"]: -2283 database_type = "regions" -2284 else: -2285 database_type = self.get_type(database=database, sql_query=query) -2286 -2287 # database object -2288 # If database is string, then create database conn -2289 if isinstance(database, str): -2290 database_conn = Database(database=database).get_conn() -2291 else: -2292 database_conn = database -2293 -2294 # Existing columns -2295 existing_columns = self.get_columns( -2296 database=database_conn, -2297 table=self.get_database_table(database=database), -2298 sql_query=query, -2299 ) -2300 -2301 # Extra columns -2302 extra_columns = self.get_extra_columns( -2303 database=database_conn, database_type=output_type, sql_query=query -2304 ) -2305 -2306 # Needed columns -2307 needed_columns = self.get_needed_columns( -2308 database_columns=existing_columns, database_type=database_type +2274 # tmp files +2275 tmp_files = [] +2276 +2277 # query_set +2278 query_set = "" +2279 +2280 # Header columns +2281 if not existing_columns_header and output_header: +2282 existing_columns_header = self.get_header_file_columns(output_header) +2283 +2284 # Auto-detect output type and compression and delimiter +2285 output_type = get_file_format(output_database) +2286 compressed = self.is_compressed(database=output_database) +2287 delimiter = FILE_FORMAT_DELIMITERS.get(output_type, "\t") +2288 +2289 # database type +2290 if output_type in ["vcf"]: +2291 database_type = "vcf" +2292 elif output_type in ["bed"]: +2293 database_type = "regions" +2294 else: +2295 database_type = self.get_type(database=database, sql_query=query) +2296 +2297 # database object +2298 # If database is string, then create database conn +2299 if isinstance(database, str): +2300 database_conn = Database(database=database).get_conn() +2301 else: +2302 database_conn = database +2303 +2304 # Existing columns +2305 existing_columns = self.get_columns( +2306 database=database_conn, +2307 table=self.get_database_table(database=database), +2308 sql_query=query, 2309 ) 2310 -2311 # Order by -2312 order_by_list = [] -2313 if order_by: -2314 # Split order by options -2315 order_by_split = order_by.split(",") -2316 for order_by_option in order_by_split: -2317 # Split order by option -2318 order_by_option_split = order_by_option.strip().split(" ") -2319 order_by_option_split_column = order_by_option_split[0] -2320 if len(order_by_option_split) > 1: -2321 order_by_option_split_order = order_by_option_split[1] -2322 else: -2323 order_by_option_split_order = "ASC" -2324 # Chek if column exists -2325 if ( -2326 order_by_option_split_column.replace('"', "").strip() -2327 in existing_columns -2328 ): -2329 order_by_list.append( -2330 f"{order_by_option_split_column} {order_by_option_split_order}" -2331 ) -2332 -2333 # Clean order by -2334 order_by_clean = ", ".join(order_by_list) -2335 -2336 # Query values -2337 default_empty_value = "" -2338 query_export_format = None -2339 include_header = 
False -2340 post_process = False -2341 order_by_sql = "" -2342 post_process_just_move = False -2343 -2344 # export options -2345 export_options = {} -2346 -2347 # VCF -2348 if output_type in ["vcf"]: -2349 if not self.is_vcf(database=database_conn, sql_query=query): -2350 extra_columns = [] -2351 else: -2352 extra_columns = existing_columns_header +2311 # Extra columns +2312 extra_columns = self.get_extra_columns( +2313 database=database_conn, database_type=output_type, sql_query=query +2314 ) +2315 +2316 # Needed columns +2317 needed_columns = self.get_needed_columns( +2318 database_columns=existing_columns, database_type=database_type +2319 ) +2320 +2321 # Order by +2322 order_by_list = [] +2323 if order_by: +2324 # Split order by options +2325 order_by_split = order_by.split(",") +2326 for order_by_option in order_by_split: +2327 # Split order by option +2328 order_by_option_split = order_by_option.strip().split(" ") +2329 order_by_option_split_column = order_by_option_split[0] +2330 if len(order_by_option_split) > 1: +2331 order_by_option_split_order = order_by_option_split[1] +2332 else: +2333 order_by_option_split_order = "ASC" +2334 # Chek if column exists +2335 if ( +2336 order_by_option_split_column.replace('"', "").strip() +2337 in existing_columns +2338 ): +2339 order_by_list.append( +2340 f"{order_by_option_split_column} {order_by_option_split_order}" +2341 ) +2342 +2343 # Clean order by +2344 order_by_clean = ", ".join(order_by_list) +2345 +2346 # Query values +2347 default_empty_value = "" +2348 query_export_format = None +2349 include_header = False +2350 post_process = False +2351 order_by_sql = "" +2352 post_process_just_move = False 2353 -2354 # Check VCF format with extra columns -2355 extra_columns_clean = [] +2354 # export options +2355 export_options = {} 2356 -2357 # Force samples list in parameter -2358 if sample_list: -2359 if "FORMAT" in extra_columns: -2360 extra_columns = ["FORMAT"] + sample_list -2361 else: -2362 extra_columns = sample_list +2357 # VCF +2358 if output_type in ["vcf"]: +2359 if not self.is_vcf(database=database_conn, sql_query=query): +2360 extra_columns = [] +2361 else: +2362 extra_columns = existing_columns_header 2363 -2364 # Check columns -2365 else: -2366 for extra_column in extra_columns: -2367 if extra_column not in needed_columns and ( -2368 extra_column == "FORMAT" -2369 or ( -2370 "FORMAT" in extra_columns_clean -2371 and self.is_genotype_column( -2372 database=database, column=extra_column -2373 ) -2374 ) -2375 ): -2376 extra_columns_clean.append(extra_column) -2377 extra_columns = extra_columns_clean -2378 -2379 default_empty_value = "." 
-2380 query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER, QUOTE '', COMPRESSION 'gzip'" -2381 include_header = True -2382 post_process = True -2383 # Export options -2384 if not compression_type: -2385 compression_type = "bgzip" -2386 export_options = { -2387 "format": "CSV", -2388 "delimiter": delimiter, -2389 "header": True, -2390 "quote": None, -2391 "compression": compression_type, -2392 } -2393 compresslevel = 1 -2394 -2395 # TSV/CSV/TBL -2396 elif output_type in ["tsv", "csv", "tbl"]: -2397 if output_type in ["csv", "tbl"]: -2398 quote = '"' -2399 else: -2400 quote = "" -2401 query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER, QUOTE '{quote}', COMPRESSION 'gzip'" -2402 if delimiter in ["\t"]: -2403 include_header = header_in_output and True -2404 post_process = True -2405 if order_by_clean: -2406 order_by_sql = f"ORDER BY {order_by_clean}" -2407 # Export options -2408 if not compression_type: -2409 compression_type = "gzip" -2410 export_options = { -2411 "format": "CSV", -2412 "delimiter": delimiter, -2413 "header": True, -2414 "quote": quote, -2415 "compression": compression_type, -2416 } -2417 -2418 # JSON -2419 elif output_type in ["json"]: -2420 query_export_format = "FORMAT JSON, ARRAY TRUE" -2421 include_header = False -2422 post_process = True -2423 if order_by_clean: -2424 order_by_sql = f"ORDER BY {order_by_clean}" -2425 # Export options -2426 if not compression_type: -2427 compression_type = "gzip" -2428 export_options = { -2429 "format": "JSON", -2430 "array": True, -2431 "compression": compression_type, -2432 } -2433 -2434 # Parquet -2435 elif output_type in ["parquet"]: -2436 query_export_format = "FORMAT PARQUET" -2437 # Export options +2364 # Check VCF format with extra columns +2365 extra_columns_clean = [] +2366 +2367 # Force samples list in parameter +2368 if sample_list: +2369 if "FORMAT" in extra_columns: +2370 extra_columns = ["FORMAT"] + sample_list +2371 else: +2372 extra_columns = sample_list +2373 +2374 # Check columns +2375 else: +2376 for extra_column in extra_columns: +2377 if extra_column not in needed_columns and ( +2378 extra_column == "FORMAT" +2379 or ( +2380 "FORMAT" in extra_columns_clean +2381 and self.is_genotype_column( +2382 database=database, column=extra_column +2383 ) +2384 ) +2385 ): +2386 extra_columns_clean.append(extra_column) +2387 extra_columns = extra_columns_clean +2388 +2389 default_empty_value = "." 
+2390 query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER, QUOTE '', COMPRESSION 'gzip'" +2391 include_header = True +2392 post_process = True +2393 # Export options +2394 if not compression_type: +2395 compression_type = "bgzip" +2396 export_options = { +2397 "format": "CSV", +2398 "delimiter": delimiter, +2399 "header": True, +2400 "quote": None, +2401 "compression": compression_type, +2402 } +2403 compresslevel = 1 +2404 +2405 # TSV/CSV/TBL +2406 elif output_type in ["tsv", "csv", "tbl"]: +2407 if output_type in ["csv", "tbl"]: +2408 quote = '"' +2409 else: +2410 quote = "" +2411 query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER, QUOTE '{quote}', COMPRESSION 'gzip'" +2412 if delimiter in ["\t"]: +2413 include_header = header_in_output and True +2414 post_process = True +2415 if order_by_clean: +2416 order_by_sql = f"ORDER BY {order_by_clean}" +2417 # Export options +2418 if not compression_type: +2419 compression_type = "gzip" +2420 export_options = { +2421 "format": "CSV", +2422 "delimiter": delimiter, +2423 "header": True, +2424 "quote": quote, +2425 "compression": compression_type, +2426 } +2427 +2428 # JSON +2429 elif output_type in ["json"]: +2430 query_export_format = "FORMAT JSON, ARRAY TRUE" +2431 include_header = False +2432 post_process = True +2433 if order_by_clean: +2434 order_by_sql = f"ORDER BY {order_by_clean}" +2435 # Export options +2436 if not compression_type: +2437 compression_type = "gzip" 2438 export_options = { -2439 "format": "PARQUET", -2440 } -2441 include_header = False -2442 post_process = True +2439 "format": "JSON", +2440 "array": True, +2441 "compression": compression_type, +2442 } 2443 -2444 # BED -2445 elif output_type in ["bed"]: -2446 query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER" -2447 include_header = True -2448 post_process = True -2449 if order_by_clean: -2450 order_by_sql = f"ORDER BY {order_by_clean}" -2451 # Export options -2452 if not compression_type: -2453 compression_type = "gzip" -2454 export_options = { -2455 "format": "CSV", -2456 "delimiter": delimiter, -2457 "header": True, -2458 "quote": None, -2459 "compression": compression_type, -2460 } -2461 -2462 # duckDB -2463 elif output_type in ["duckdb"]: -2464 -2465 # Needed column -2466 needed_columns = [] -2467 -2468 # Export database as Parquet -2469 database_export_parquet_file = os.path.join(tmp_dir, "output.parquet") -2470 self.export( -2471 database=database, output_database=database_export_parquet_file -2472 ) -2473 -2474 # Create database and connexion -2475 output_database_conn = duckdb.connect(output_database) -2476 -2477 # Create table in database connexion with Parquet file -2478 query_copy = f""" -2479 CREATE TABLE {table} -2480 AS {self.get_sql_database_link(database=database_export_parquet_file)} -2481 """ -2482 output_database_conn.execute(query_copy) +2444 # Parquet +2445 elif output_type in ["parquet"]: +2446 query_export_format = "FORMAT PARQUET" +2447 # Export options +2448 export_options = { +2449 "format": "PARQUET", +2450 } +2451 include_header = False +2452 post_process = True +2453 +2454 # BED +2455 elif output_type in ["bed"]: +2456 query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER" +2457 include_header = True +2458 post_process = True +2459 if order_by_clean: +2460 order_by_sql = f"ORDER BY {order_by_clean}" +2461 # Export options +2462 if not compression_type: +2463 compression_type = "gzip" +2464 export_options = { +2465 "format": "CSV", +2466 "delimiter": delimiter, +2467 "header": True, 
+2468 "quote": None, +2469 "compression": compression_type, +2470 } +2471 +2472 # duckDB +2473 elif output_type in ["duckdb"]: +2474 +2475 # Needed column +2476 needed_columns = [] +2477 +2478 # Export database as Parquet +2479 database_export_parquet_file = os.path.join(tmp_dir, "output.parquet") +2480 self.export( +2481 database=database, output_database=database_export_parquet_file +2482 ) 2483 -2484 # Close connexion -2485 output_database_conn.close() +2484 # Create database and connexion +2485 output_database_conn = duckdb.connect(output_database) 2486 -2487 # remove tmp -2488 remove_if_exists([database_export_parquet_file]) -2489 -2490 return os.path.exists(output_database) -2491 -2492 # Partition -2493 if parquet_partitions: -2494 parquet_partitions_clean = [] -2495 parquet_partitions_array = [] -2496 for parquet_partition in parquet_partitions: -2497 parquet_partitions_array.append( -2498 parquet_partition.translate({'"': None, "'": None, " ": None}) -2499 ) -2500 parquet_partitions_clean.append( -2501 '"' -2502 + parquet_partition.translate({'"': None, "'": None, " ": None}) -2503 + '"' -2504 ) -2505 parquet_partitions_by = ",".join(parquet_partitions_clean) -2506 query_export_format += ( -2507 f", PARTITION_BY ({parquet_partitions_by}), OVERWRITE_OR_IGNORE" -2508 ) -2509 export_options["partition_by"] = parquet_partitions_array -2510 if export_options.get("format", None) == "CSV": -2511 export_mode = "duckdb" -2512 post_process_just_move = True -2513 -2514 # Construct query columns -2515 query_columns = [] -2516 -2517 # Add Needed columns -2518 for needed_column in needed_columns: -2519 if needed_columns[needed_column]: -2520 query_column_name = needed_columns[needed_column] -2521 query_column = f""" "{needed_columns[needed_column]}" """ -2522 else: -2523 query_column_name = default_empty_value -2524 query_column = f""" '{default_empty_value}' """ -2525 query_column_as = f""" "{needed_column}" """ -2526 if query_column_name == needed_column: -2527 query_columns.append(f""" {query_column} """) -2528 else: -2529 query_columns.append(f""" {query_column} AS {query_column_as} """) -2530 -2531 # Add Extra columns -2532 for extra_column in extra_columns: -2533 if extra_column not in needed_columns: -2534 # query_columns.append(f""" "{extra_column}" AS "{extra_column}" """) -2535 query_columns.append(f""" "{extra_column}" """) -2536 -2537 # Query export columns -2538 query_export_columns = f""" {",".join(query_columns)} """ -2539 -2540 if query_columns: -2541 -2542 # Compressed tmp file -2543 query_output_database_tmp = os.path.join(tmp_dir, "output") -2544 -2545 # Query -2546 # If no query, generate query of the database -2547 if not query: -2548 query = f""" -2549 SELECT {query_export_columns} -2550 FROM {self.get_sql_database_link(database=database)} -2551 {order_by_sql} -2552 """ -2553 -2554 # Test empty query -2555 df = self.conn.execute(query).fetch_record_batch(1) -2556 query_empty = True -2557 for d in df: -2558 query_empty = False -2559 break -2560 if query_empty: -2561 log.error("Export failed: Empty") -2562 raise ValueError("Export failed: Empty") +2487 # Create table in database connexion with Parquet file +2488 query_copy = f""" +2489 CREATE TABLE {table} +2490 AS {self.get_sql_database_link(database=database_export_parquet_file)} +2491 """ +2492 output_database_conn.execute(query_copy) +2493 +2494 # Close connexion +2495 output_database_conn.close() +2496 +2497 # remove tmp +2498 remove_if_exists([database_export_parquet_file]) +2499 +2500 return 
os.path.exists(output_database) +2501 +2502 # Partition +2503 if parquet_partitions: +2504 parquet_partitions_clean = [] +2505 parquet_partitions_array = [] +2506 for parquet_partition in parquet_partitions: +2507 parquet_partitions_array.append( +2508 parquet_partition.translate({'"': None, "'": None, " ": None}) +2509 ) +2510 parquet_partitions_clean.append( +2511 '"' +2512 + parquet_partition.translate({'"': None, "'": None, " ": None}) +2513 + '"' +2514 ) +2515 parquet_partitions_by = ",".join(parquet_partitions_clean) +2516 query_export_format += ( +2517 f", PARTITION_BY ({parquet_partitions_by}), OVERWRITE_OR_IGNORE" +2518 ) +2519 export_options["partition_by"] = parquet_partitions_array +2520 if export_options.get("format", None) == "CSV": +2521 export_mode = "duckdb" +2522 post_process_just_move = True +2523 +2524 # Construct query columns +2525 query_columns = [] +2526 +2527 # Add Needed columns +2528 for needed_column in needed_columns: +2529 if needed_columns[needed_column]: +2530 query_column_name = needed_columns[needed_column] +2531 query_column = f""" "{needed_columns[needed_column]}" """ +2532 else: +2533 query_column_name = default_empty_value +2534 query_column = f""" '{default_empty_value}' """ +2535 query_column_as = f""" "{needed_column}" """ +2536 if query_column_name == needed_column: +2537 query_columns.append(f""" {query_column} """) +2538 else: +2539 query_columns.append(f""" {query_column} AS {query_column_as} """) +2540 +2541 # Add Extra columns +2542 for extra_column in extra_columns: +2543 if extra_column not in needed_columns: +2544 # query_columns.append(f""" "{extra_column}" AS "{extra_column}" """) +2545 query_columns.append(f""" "{extra_column}" """) +2546 +2547 # Query export columns +2548 query_export_columns = f""" {",".join(query_columns)} """ +2549 +2550 if query_columns: +2551 +2552 # Compressed tmp file +2553 query_output_database_tmp = os.path.join(tmp_dir, "output") +2554 +2555 # Query +2556 # If no query, generate query of the database +2557 if not query: +2558 query = f""" +2559 SELECT {query_export_columns} +2560 FROM {self.get_sql_database_link(database=database)} +2561 {order_by_sql} +2562 """ 2563 -2564 # Schema names -2565 schema_names = None -2566 -2567 # Export mode pyarrow -2568 if export_mode == "pyarrow": -2569 -2570 # Compression mode -2571 # If compress required and compression type as gzip or bgzip -2572 # For bgzip compression, recompression will be done, and compression with gzip for tmp file done (level 1) -2573 # to reduce tmp file size -2574 compression_mode_gzip = compressed and ( -2575 export_options.get("compression", None) in ["gzip", "bgzip"] -2576 ) -2577 -2578 # File stream mode (str or bytes) -2579 f_mode = "" -2580 if compression_mode_gzip: -2581 f_mode = "b" -2582 -2583 if include_header: -2584 -2585 # Open stream files (uncompressed and compressed) -2586 with open(query_output_database_tmp, mode="w") as f, pgzip.open( -2587 query_output_database_tmp, -2588 mode="w", -2589 thread=threads, -2590 compresslevel=compresslevel, -2591 ) as f_gz: -2592 -2593 # Switch to compressed stream file -2594 if compression_mode_gzip: -2595 f = f_gz +2564 # Test empty query +2565 df = self.conn.execute(query).fetch_record_batch(1) +2566 query_empty = True +2567 for d in df: +2568 query_empty = False +2569 break +2570 if query_empty: +2571 log.warning("Export warning: Empty") +2572 remove_header_line = False +2573 else: +2574 remove_header_line = True +2575 +2576 # Schema names +2577 schema_names = None +2578 +2579 # Export mode 
pyarrow +2580 if export_mode == "pyarrow": +2581 +2582 # Compression mode +2583 # If compress required and compression type as gzip or bgzip +2584 # For bgzip compression, recompression will be done, and compression with gzip for tmp file done (level 1) +2585 # to reduce tmp file size +2586 compression_mode_gzip = compressed and ( +2587 export_options.get("compression", None) in ["gzip", "bgzip"] +2588 ) +2589 +2590 # File stream mode (str or bytes) +2591 f_mode = "" +2592 if compression_mode_gzip: +2593 f_mode = "b" +2594 +2595 if include_header: 2596 -2597 # Generate header tmp file -2598 query_output_header_tmp = os.path.join(tmp_dir, "header") -2599 self.get_header_file( -2600 header_file=query_output_header_tmp, -2601 remove_header_line=True, -2602 sql_query=query, -2603 ) +2597 # Open stream files (uncompressed and compressed) +2598 with open(query_output_database_tmp, mode="w") as f, pgzip.open( +2599 query_output_database_tmp, +2600 mode="w", +2601 thread=threads, +2602 compresslevel=compresslevel, +2603 ) as f_gz: 2604 -2605 # Write header to tmp file -2606 with open( -2607 query_output_header_tmp, "r" + f_mode -2608 ) as output_header_tmp: -2609 f.write(output_header_tmp.read()) -2610 -2611 # JSON format - Add special "[" character at the beginning of the file -2612 if export_options.get("format") in ["JSON"]: -2613 -2614 # Open stream files (uncompressed and compressed) -2615 with open(query_output_database_tmp, mode="a") as f, pgzip.open( -2616 query_output_database_tmp, -2617 mode="a", -2618 thread=threads, -2619 compresslevel=compresslevel, -2620 ) as f_gz: -2621 -2622 # Switch to compressed stream file -2623 if compression_mode_gzip: -2624 f = f_gz -2625 f.write(b"[\n") -2626 else: -2627 f.write("[\n") -2628 -2629 # Open stream files (uncompressed and compressed) for chunk -2630 with open(query_output_database_tmp, mode="a") as f, pgzip.open( -2631 query_output_database_tmp, -2632 mode="a", -2633 thread=threads, -2634 compresslevel=compresslevel, -2635 ) as f_gz: -2636 -2637 # Switch to compressed stream file -2638 if compression_mode_gzip: -2639 f = f_gz +2605 # Switch to compressed stream file +2606 if compression_mode_gzip: +2607 f = f_gz +2608 +2609 # Generate header tmp file +2610 query_output_header_tmp = os.path.join(tmp_dir, "header") +2611 self.get_header_file( +2612 header_file=query_output_header_tmp, +2613 remove_header_line=remove_header_line, +2614 sql_query=query, +2615 ) +2616 +2617 # Write header to tmp file +2618 with open( +2619 query_output_header_tmp, "r" + f_mode +2620 ) as output_header_tmp: +2621 f.write(output_header_tmp.read()) +2622 +2623 # JSON format - Add special "[" character at the beginning of the file +2624 if export_options.get("format") in ["JSON"]: +2625 +2626 # Open stream files (uncompressed and compressed) +2627 with open(query_output_database_tmp, mode="a") as f, pgzip.open( +2628 query_output_database_tmp, +2629 mode="a", +2630 thread=threads, +2631 compresslevel=compresslevel, +2632 ) as f_gz: +2633 +2634 # Switch to compressed stream file +2635 if compression_mode_gzip: +2636 f = f_gz +2637 f.write(b"[\n") +2638 else: +2639 f.write("[\n") 2640 -2641 # Chunk query with batch of dataframes of chunk_size -2642 df = self.conn.execute(query).fetch_record_batch(chunk_size) -2643 -2644 # id of chunk -2645 i = 0 -2646 -2647 # For each chunk dataframe -2648 for d in df: -2649 -2650 # Schema names -2651 schema_names = d.schema.names +2641 # Open stream files (uncompressed and compressed) for chunk +2642 with open(query_output_database_tmp, 
mode="a") as f, pgzip.open( +2643 query_output_database_tmp, +2644 mode="a", +2645 thread=threads, +2646 compresslevel=compresslevel, +2647 ) as f_gz: +2648 +2649 # Switch to compressed stream file +2650 if compression_mode_gzip: +2651 f = f_gz 2652 -2653 # id of chunk -2654 i += 1 +2653 # Chunk query with batch of dataframes of chunk_size +2654 df = self.conn.execute(query).fetch_record_batch(chunk_size) 2655 -2656 # Log - number of records -2657 log.debug(f"Chunk {i}: records process...") +2656 # id of chunk +2657 i = 0 2658 -2659 # Check process for first chunk -2660 if i == 1: +2659 # For each chunk dataframe +2660 for d in df: 2661 -2662 # If include header in file -2663 header = export_options.get("header", True) +2662 # Schema names +2663 schema_names = d.schema.names 2664 -2665 # Parquet output format -2666 # Either a folder or a writer -2667 if export_options.get("format") in ["PARQUET"]: -2668 -2669 # For Parquet with multiple file - folder -2670 if export_options.get( -2671 "partition_by", None -2672 ) or export_options.get("per_thread_output", False): -2673 query_output_database_tmp = ( -2674 f"{query_output_database_tmp}.parquet" -2675 ) +2665 # id of chunk +2666 i += 1 +2667 +2668 # Log - number of records +2669 log.debug(f"Chunk {i}: records process...") +2670 +2671 # Check process for first chunk +2672 if i == 1: +2673 +2674 # If include header in file +2675 header = export_options.get("header", True) 2676 -2677 # For Parquet as a unique file - writer -2678 else: -2679 writer = pq.ParquetWriter( -2680 query_output_database_tmp, d.schema -2681 ) -2682 -2683 else: -2684 -2685 # Switch of header in file for not first chunk -2686 header = False -2687 -2688 # CSV format -2689 if export_options.get("format") in ["CSV"]: -2690 -2691 # With quote option -2692 if export_options.get("quote", None): -2693 -2694 # Polars write dataframe -2695 pl.from_arrow(d).write_csv( -2696 file=f, -2697 separator=export_options.get("delimiter", ""), -2698 include_header=header, -2699 quote_char=export_options.get("quote", '"'), -2700 ) -2701 -2702 # Without quote option -2703 else: -2704 -2705 # Polars write dataframe -2706 pl.from_arrow(d).write_csv( -2707 file=f, -2708 separator=export_options.get("delimiter", ""), -2709 include_header=header, -2710 quote_style="never", -2711 ) -2712 -2713 # JSON format -2714 elif export_options.get("format") in ["JSON"]: -2715 -2716 # Compressed mode gzip -2717 if compression_mode_gzip: -2718 -2719 # Add comma at the beginning of dataframe (if not the first one) in bytes mode -2720 if i > 1: -2721 f.write(b",\n") -2722 -2723 # Write dataframe in bytes mode -2724 f.write( -2725 str.encode( -2726 pl.from_arrow(d) -2727 .write_ndjson() -2728 .replace("\n{", ",\n{") -2729 .replace("[", "") -2730 .replace("]", "") -2731 ) -2732 ) -2733 -2734 # Not compressed mode gzip (string mode) -2735 else: -2736 -2737 # Add comma at the beginning of dataframe (if not the first one) in string mode -2738 if i > 1: -2739 f.write(",\n") -2740 -2741 # Write dataframe in string mode -2742 f.write( -2743 pl.from_arrow(d) -2744 .write_ndjson() -2745 .replace("\n{", ",\n{") -2746 .replace("[", "") -2747 .replace("]", "") -2748 ) -2749 -2750 # Parquet format -2751 elif export_options.get("format") in ["PARQUET"]: +2677 # Parquet output format +2678 # Either a folder or a writer +2679 if export_options.get("format") in ["PARQUET"]: +2680 +2681 # For Parquet with multiple file - folder +2682 if export_options.get( +2683 "partition_by", None +2684 ) or export_options.get("per_thread_output", 
False): +2685 query_output_database_tmp = ( +2686 f"{query_output_database_tmp}.parquet" +2687 ) +2688 +2689 # For Parquet as a unique file - writer +2690 else: +2691 writer = pq.ParquetWriter( +2692 query_output_database_tmp, d.schema +2693 ) +2694 +2695 else: +2696 +2697 # Switch of header in file for not first chunk +2698 header = False +2699 +2700 # CSV format +2701 if export_options.get("format") in ["CSV"]: +2702 +2703 # With quote option +2704 if export_options.get("quote", None): +2705 +2706 # Polars write dataframe +2707 pl.from_arrow(d).write_csv( +2708 file=f, +2709 separator=export_options.get("delimiter", ""), +2710 include_header=header, +2711 quote_char=export_options.get("quote", '"'), +2712 ) +2713 +2714 # Without quote option +2715 else: +2716 +2717 # Polars write dataframe +2718 pl.from_arrow(d).write_csv( +2719 file=f, +2720 separator=export_options.get("delimiter", ""), +2721 include_header=header, +2722 quote_style="never", +2723 ) +2724 +2725 # JSON format +2726 elif export_options.get("format") in ["JSON"]: +2727 +2728 # Compressed mode gzip +2729 if compression_mode_gzip: +2730 +2731 # Add comma at the beginning of dataframe (if not the first one) in bytes mode +2732 if i > 1: +2733 f.write(b",\n") +2734 +2735 # Write dataframe in bytes mode +2736 f.write( +2737 str.encode( +2738 pl.from_arrow(d) +2739 .write_ndjson() +2740 .replace("\n{", ",\n{") +2741 .replace("[", "") +2742 .replace("]", "") +2743 ) +2744 ) +2745 +2746 # Not compressed mode gzip (string mode) +2747 else: +2748 +2749 # Add comma at the beginning of dataframe (if not the first one) in string mode +2750 if i > 1: +2751 f.write(",\n") 2752 -2753 # Partition by fields -2754 partition_by = export_options.get("partition_by", None) -2755 -2756 if partition_by: -2757 -2758 # For No partition but split parquet files into a folder -2759 if "None" in partition_by: -2760 partition_by = None +2753 # Write dataframe in string mode +2754 f.write( +2755 pl.from_arrow(d) +2756 .write_ndjson() +2757 .replace("\n{", ",\n{") +2758 .replace("[", "") +2759 .replace("]", "") +2760 ) 2761 -2762 # Pyarrow write -2763 pq.write_to_dataset( -2764 pa.Table.from_batches([d]), -2765 query_output_database_tmp, -2766 partition_cols=partition_by, -2767 use_threads=threads, -2768 existing_data_behavior="overwrite_or_ignore", -2769 ) -2770 -2771 # Parquet in unique file -2772 else: -2773 writer.write_batch(d) -2774 -2775 # Close Parquet writer -2776 if ( -2777 export_options.get("format") in ["PARQUET"] -2778 and not export_options.get("partition_by", None) -2779 and not export_options.get("per_thread_output", None) -2780 ): -2781 writer.close() +2762 # Parquet format +2763 elif export_options.get("format") in ["PARQUET"]: +2764 +2765 # Partition by fields +2766 partition_by = export_options.get("partition_by", None) +2767 +2768 if partition_by: +2769 +2770 # For No partition but split parquet files into a folder +2771 if "None" in partition_by: +2772 partition_by = None +2773 +2774 # Pyarrow write +2775 pq.write_to_dataset( +2776 pa.Table.from_batches([d]), +2777 query_output_database_tmp, +2778 partition_cols=partition_by, +2779 use_threads=threads, +2780 existing_data_behavior="overwrite_or_ignore", +2781 ) 2782 -2783 # JSON format - Add special "]" character at the end of the file -2784 if export_options.get("format") in ["JSON"]: -2785 -2786 # Open stream files (uncompressed and compressed) -2787 with open(query_output_database_tmp, mode="a") as f, pgzip.open( -2788 query_output_database_tmp, -2789 mode="a", -2790 
thread=threads, -2791 compresslevel=compresslevel, -2792 ) as f_gz: -2793 -2794 # Switch to compressed stream file -2795 if compression_mode_gzip: -2796 f = f_gz -2797 f.write(b"]\n") -2798 else: -2799 f.write("]\n") -2800 -2801 # Export mode duckdb -2802 elif export_mode == "duckdb": -2803 -2804 # Create COPY TO query -2805 query_copy = f""" -2806 {query_set} -2807 COPY ( -2808 {query} -2809 ) -2810 TO '{query_output_database_tmp}' -2811 WITH ({query_export_format}) -2812 """ -2813 -2814 # Export with duckdb -2815 self.query(query=query_copy) -2816 -2817 # Export mode unknown -2818 else: -2819 log.error(f"Export mode '{export_mode}' unknown") -2820 raise ValueError(f"Export mode '{export_mode}' unknown") -2821 -2822 # Post process -2823 if post_process: -2824 -2825 # Log - number of records -2826 log.debug("Post processing...") -2827 -2828 # Input files -2829 input_files = [] -2830 -2831 # Export mode duckdb and include header -2832 if export_mode == "duckdb" and include_header: +2783 # Parquet in unique file +2784 else: +2785 writer.write_batch(d) +2786 +2787 # Close Parquet writer +2788 if ( +2789 export_options.get("format") in ["PARQUET"] +2790 and not export_options.get("partition_by", None) +2791 and not export_options.get("per_thread_output", None) +2792 ): +2793 writer.close() +2794 +2795 # JSON format - Add special "]" character at the end of the file +2796 if export_options.get("format") in ["JSON"]: +2797 +2798 # Open stream files (uncompressed and compressed) +2799 with open(query_output_database_tmp, mode="a") as f, pgzip.open( +2800 query_output_database_tmp, +2801 mode="a", +2802 thread=threads, +2803 compresslevel=compresslevel, +2804 ) as f_gz: +2805 +2806 # Switch to compressed stream file +2807 if compression_mode_gzip: +2808 f = f_gz +2809 f.write(b"]\n") +2810 else: +2811 f.write("]\n") +2812 +2813 # Export mode duckdb +2814 elif export_mode == "duckdb": +2815 +2816 # Create COPY TO query +2817 query_copy = f""" +2818 {query_set} +2819 COPY ( +2820 {query} +2821 ) +2822 TO '{query_output_database_tmp}' +2823 WITH ({query_export_format}) +2824 """ +2825 +2826 # Export with duckdb +2827 self.query(query=query_copy) +2828 +2829 # Export mode unknown +2830 else: +2831 log.error(f"Export mode '{export_mode}' unknown") +2832 raise ValueError(f"Export mode '{export_mode}' unknown") 2833 -2834 # create tmp header file -2835 query_output_header_tmp = os.path.join(tmp_dir, "header") -2836 tmp_files.append(query_output_header_tmp) -2837 self.get_header_file( -2838 header_file=query_output_header_tmp, remove_header_line=True -2839 ) -2840 -2841 # Add tmp header file for concat and compress -2842 input_files.append(query_output_header_tmp) -2843 -2844 # Add variants file -2845 input_files.append(query_output_database_tmp) -2846 -2847 # Output with concat and compress -2848 if not post_process_just_move and ( -2849 export_mode == "duckdb" -2850 or ( -2851 compressed -2852 and export_options.get("compression", None) == "bgzip" -2853 ) -2854 or sort -2855 or index -2856 ): -2857 -2858 # Compression type -2859 if not compressed: -2860 compression_type = "none" -2861 else: -2862 compression_type = export_options.get( -2863 "compression", "bgzip" -2864 ) -2865 -2866 # Concat and compress -2867 concat_and_compress_files( -2868 input_files=input_files, -2869 output_file=output_database, -2870 compression_type=compression_type, -2871 threads=threads, -2872 sort=sort, -2873 index=index, -2874 ) -2875 -2876 # Output already generated file (either compressed in gzip or not compressed, with 
included header if needed) -2877 else: -2878 -2879 # Move tmp file -2880 shutil.move(query_output_database_tmp, output_database) -2881 -2882 # Generate associated header file -2883 if output_header and export_header: -2884 -2885 # Log - Generate header -2886 log.debug("Generate header...") +2834 # Post process +2835 if post_process: +2836 +2837 # Log - number of records +2838 log.debug("Post processing...") +2839 +2840 # Input files +2841 input_files = [] +2842 +2843 # Export mode duckdb and include header +2844 if export_mode == "duckdb" and include_header: +2845 +2846 # create tmp header file +2847 query_output_header_tmp = os.path.join(tmp_dir, "header") +2848 tmp_files.append(query_output_header_tmp) +2849 self.get_header_file( +2850 header_file=query_output_header_tmp, remove_header_line=remove_header_line +2851 ) +2852 +2853 # Add tmp header file for concat and compress +2854 input_files.append(query_output_header_tmp) +2855 +2856 # Add variants file +2857 input_files.append(query_output_database_tmp) +2858 +2859 # Output with concat and compress +2860 if not post_process_just_move and ( +2861 export_mode == "duckdb" +2862 or ( +2863 compressed +2864 and export_options.get("compression", None) == "bgzip" +2865 ) +2866 or sort +2867 or index +2868 ): +2869 +2870 # Compression type +2871 if not compressed: +2872 compression_type = "none" +2873 else: +2874 compression_type = export_options.get( +2875 "compression", "bgzip" +2876 ) +2877 +2878 # Concat and compress +2879 concat_and_compress_files( +2880 input_files=input_files, +2881 output_file=output_database, +2882 compression_type=compression_type, +2883 threads=threads, +2884 sort=sort, +2885 index=index, +2886 ) 2887 -2888 # Create database -2889 database_for_header = Database(database=output_database) +2888 # Output already generated file (either compressed in gzip or not compressed, with included header if needed) +2889 else: 2890 -2891 # Remove header if exists -2892 remove_if_exists([output_header]) +2891 # Move tmp file +2892 shutil.move(query_output_database_tmp, output_database) 2893 -2894 # Find columns in database -2895 if schema_names: -2896 header_columns_from_database = schema_names -2897 else: -2898 header_columns_from_database = ( -2899 database_for_header.get_header_columns_from_database( -2900 database=output_database -2901 ) -2902 ) -2903 -2904 # Generate header file -2905 database_for_header.get_header_file( -2906 header_file=output_header, -2907 replace_header_line=header_columns_from_database, -2908 force=True, -2909 ) -2910 -2911 # Clean tmp files (deprecated) -2912 remove_if_exists(tmp_files) -2913 -2914 # Return if file exists -2915 return os.path.exists(output_database) and self.get_type(output_database) +2894 # Generate associated header file +2895 if output_header and export_header: +2896 +2897 # Log - Generate header +2898 log.debug("Generate header...") +2899 +2900 # Create database +2901 database_for_header = Database(database=output_database) +2902 +2903 # Remove header if exists +2904 remove_if_exists([output_header]) +2905 +2906 # Find columns in database +2907 if schema_names: +2908 header_columns_from_database = schema_names +2909 else: +2910 header_columns_from_database = ( +2911 database_for_header.get_header_columns_from_database( +2912 database=output_database +2913 ) +2914 ) +2915 +2916 # Generate header file +2917 database_for_header.get_header_file( +2918 header_file=output_header, +2919 replace_header_line=header_columns_from_database, +2920 force=True, +2921 ) +2922 +2923 # Clean tmp files 
(deprecated)
+2924 remove_if_exists(tmp_files)
+2925
+2926 # Return if file exists
+2927 return os.path.exists(output_database) and self.get_type(output_database)

 6234 """
 6235 )
 6236 sql_query_annotation_to_agregate.append(
- 6237 f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
+ 6237 f""" string_agg(table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 6238 )
 6239
 6240 # Not to annotate
@@ -18422,7 +18422,7 @@
6235 """ 6236 ) 6237 sql_query_annotation_to_agregate.append( - 6238 f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ + 6238 f""" string_agg(table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 6239 ) 6240 6241 # Not to annotate @@ -32655,7 +32655,7 @@
Returns
 6235 """
 6236 )
 6237 sql_query_annotation_to_agregate.append(
-6238 f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
+6238 f""" string_agg(table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 6239 )
 6240
 6241 # Not to annotate
diff --git a/docs/tips.pdf b/docs/tips.pdf
index e615578..d637cc2 100644
Binary files a/docs/tips.pdf and b/docs/tips.pdf differ
diff --git a/docs/user_guide.pdf b/docs/user_guide.pdf
index c099f38..f730d1e 100644
Binary files a/docs/user_guide.pdf and b/docs/user_guide.pdf differ
diff --git a/plugins/README.pdf b/plugins/README.pdf
index 40f2849..3c604a5 100644
Binary files a/plugins/README.pdf and b/plugins/README.pdf differ