diff --git a/README.pdf b/README.pdf
index 286883f..8af8054 100644
Binary files a/README.pdf and b/README.pdf differ
diff --git a/RELEASE_NOTES.html b/RELEASE_NOTES.html
index 409a3f9..069de39 100644
--- a/RELEASE_NOTES.html
+++ b/RELEASE_NOTES.html
@@ -228,6 +228,8 @@

 Fix packages releases
   • Fix parameters and configuration files options
   • Fix calculations list and parametrization
+  • Fix empty file export
+  • Fix BED annotation with parquet method
   • More explicite log messages
  • 1809 "bed", 1810 "json", 1811 ]: -1812 sql_from = self.get_sql_from( -1813 database=database, header_file=header_file -1814 ) -1815 sql_query = f"SELECT * FROM {sql_from} LIMIT 0" -1816 return list(self.conn.query(sql_query).columns) -1817 except ValueError: -1818 return [] -1819 -1820 return [] -1821 -1822 def get_table_columns_from_format(self, database: str = None) -> list: -1823 """ -1824 The function `get_table_columns_from_format` returns a list of table columns based on the -1825 specified database format. +1812 # Query +1813 sql_from = self.get_sql_from( +1814 database=database, header_file=header_file +1815 ) +1816 sql_query = f"SELECT * FROM {sql_from} LIMIT 0" +1817 +1818 # Get columns +1819 result_description = self.conn.execute(sql_query).description +1820 +1821 # Extract columns' names +1822 columns = [desc[0] for desc in result_description] +1823 +1824 # Return columns as list +1825 return columns 1826 -1827 :param database: The `database` parameter is a string that represents the name of the database. -1828 It is an optional parameter, which means it has a default value of `None`. If no value is -1829 provided for the `database` parameter, the `get_database()` method is called to retrieve the -1830 current database name -1831 :type database: str -1832 :return: a list of table columns. -1833 """ -1834 -1835 table_columns = None +1827 except ValueError: +1828 return [] +1829 +1830 return [] +1831 +1832 def get_table_columns_from_format(self, database: str = None) -> list: +1833 """ +1834 The function `get_table_columns_from_format` returns a list of table columns based on the +1835 specified database format. 1836 -1837 if not database: -1838 database = self.get_database() -1839 -1840 database_format = self.get_format(database) -1841 -1842 needed_columns = DATABASE_TYPE_NEEDED_COLUMNS.get(database_format, None) -1843 if needed_columns: -1844 table_columns = list(needed_columns.keys()) -1845 else: -1846 table_columns = [] -1847 -1848 return table_columns +1837 :param database: The `database` parameter is a string that represents the name of the database. +1838 It is an optional parameter, which means it has a default value of `None`. If no value is +1839 provided for the `database` parameter, the `get_database()` method is called to retrieve the +1840 current database name +1841 :type database: str +1842 :return: a list of table columns. +1843 """ +1844 +1845 table_columns = None +1846 +1847 if not database: +1848 database = self.get_database() 1849 -1850 def get_table_columns_from_file( -1851 self, -1852 database: str = None, -1853 header_file: str = None, -1854 header_file_find: bool = True, -1855 ) -> list: -1856 """ -1857 The function `get_table_columns_from_file` retrieves the column names from a database or header -1858 file. +1850 database_format = self.get_format(database) +1851 +1852 needed_columns = DATABASE_TYPE_NEEDED_COLUMNS.get(database_format, None) +1853 if needed_columns: +1854 table_columns = list(needed_columns.keys()) +1855 else: +1856 table_columns = [] +1857 +1858 return table_columns 1859 -1860 :param database: The `database` parameter is a string that represents the name or path of the -1861 database file. If this parameter is not provided, the `get_database()` method is called to -1862 retrieve the database name or path -1863 :type database: str -1864 :param header_file: The `header_file` parameter is a string that represents the file path or -1865 name of the header file. 
This file contains the header information for a table, which typically -1866 includes the names of the columns in the table -1867 :type header_file: str -1868 :param header_file_find: Allow header file find if not provided -1869 :type header_file_find: bool -1870 :return: a list of table columns. -1871 """ -1872 -1873 table_columns = None -1874 -1875 if not database: -1876 database = self.get_database() -1877 -1878 if not header_file and header_file_find: -1879 header_file = self.get_header_file() -1880 -1881 if not header_file and header_file_find: -1882 header_file = self.find_header_file(database) -1883 -1884 database_format = self.get_format(database=database) -1885 delimiter = SEP_TYPE.get(database_format, "\t") -1886 -1887 # Try from database file -1888 try: -1889 table_header = self.read_header_file(database) -1890 except ValueError: -1891 table_header = None -1892 -1893 if table_header: -1894 try: -1895 table_columns = ( -1896 table_header[self.get_header_length(header_file=table_header)] -1897 .strip() -1898 .split(delimiter) -1899 ) -1900 except IndexError: -1901 table_columns = None -1902 else: -1903 table_columns = None -1904 -1905 if not table_columns: -1906 # Try from header file -1907 try: -1908 table_header = self.read_header_file(header_file) -1909 except ValueError: -1910 table_header = None -1911 -1912 if table_header: -1913 try: -1914 table_columns = ( -1915 table_header[self.get_header_length(header_file=table_header)] -1916 .strip() -1917 .split(delimiter) -1918 ) -1919 except IndexError: -1920 table_columns = None -1921 else: -1922 table_columns = None -1923 -1924 return table_columns -1925 -1926 def get_annotations(self, database: str = None) -> object: -1927 """ -1928 This function returns the annotations of a database or the default database if none is -1929 specified. -1930 -1931 :param database: The parameter `database` is a string that represents the name of the database -1932 to retrieve annotations from. If no database name is provided, the method will use the default -1933 database name obtained from the `get_database()` method -1934 :type database: str -1935 :return: The function `get_annotations` returns the `infos` attribute of the header of a -1936 database. If the `database` parameter is not provided, it gets the current database using the -1937 `get_database` method. If there is no header, it returns `None`. -1938 """ -1939 -1940 if not database: -1941 database = self.get_database() -1942 -1943 if self.get_header(database=database): -1944 return self.get_header(database=database).infos -1945 else: -1946 return None -1947 -1948 def get_extra_columns( -1949 self, database: str = None, database_type: str = None, sql_query: str = None -1950 ) -> list: -1951 """ -1952 This Python function returns a list of extra columns in a database table that are not needed -1953 based on the database type and existing columns. -1954 -1955 :param database: A string representing the name of the database to retrieve columns from. If -1956 None is provided, the default database will be used -1957 :type database: str -1958 :param database_type: The `database_type` parameter in the `get_extra_columns` function -1959 represents the type of the database for which you want to retrieve the list of extra columns. 
It -1960 is used to determine which columns are needed based on the database type and the existing -1961 columns in the specified database table -1962 :type database_type: str -1963 :param sql_query: The `sql_query` parameter in the `get_extra_columns` function is used to pass -1964 an SQL query that can be used to retrieve specific columns from the database. This query can be -1965 customized to filter columns based on certain conditions or criteria before analyzing them to -1966 determine the extra columns that are not needed -1967 :type sql_query: str -1968 :return: A list of extra columns in a database table that are not needed based on the database -1969 type and existing columns. -1970 """ -1971 -1972 if not database: -1973 database = self.get_database() -1974 -1975 if not database: -1976 return [] -1977 -1978 existing_columns = self.get_columns( -1979 database=database, -1980 table=self.get_database_table(database), -1981 sql_query=sql_query, -1982 ) -1983 if not database_type: -1984 database_type = self.get_type(database=database, sql_query=sql_query) -1985 needed_columns = self.get_needed_columns( -1986 database_columns=existing_columns, database_type=database_type -1987 ) -1988 -1989 extra_columns = existing_columns.copy() -1990 -1991 for needed_col in needed_columns: -1992 if needed_columns.get(needed_col) in extra_columns: -1993 extra_columns.remove(needed_columns.get(needed_col)) -1994 -1995 return extra_columns -1996 -1997 def is_vcf(self, database: str = None, sql_query: str = None) -> bool: -1998 """ -1999 The `is_vcf` function checks if a given database is of type "vcf" by examining its columns and -2000 their types. -2001 -2002 :param database: The `database` parameter in the `is_vcf` function is a string that represents -2003 the name of the database that the function will use to check if the file is a VCF (Variant Call -2004 Format) file. If the `database` parameter is not provided when calling the function, it will -2005 :type database: str -2006 :param sql_query: The `sql_query` parameter in the `is_vcf` function is used to pass an SQL -2007 query string that can be used to filter the columns retrieved from the database. This query can -2008 be used to narrow down the columns that are considered when checking if the database is of type -2009 "vcf" -2010 :type sql_query: str -2011 :return: The function `is_vcf` returns a boolean value indicating whether the database type is -2012 "vcf" or not. -2013 """ -2014 -2015 if not database: -2016 database = self.get_database() -2017 -2018 if not database: -2019 return False -2020 -2021 database_columns = self.get_columns( -2022 database=database, -2023 table=self.get_database_table(database), -2024 sql_query=sql_query, -2025 ) -2026 -2027 # Assume VCF is 8 needed columns, either only or with extra FORMAT column (assume other are samples) -2028 return self.get_type_from_columns( -2029 database_columns=database_columns, check_database_type="vcf" -2030 ) == "vcf" and ("FORMAT" in database_columns or len(database_columns) == 8) -2031 -2032 def get_conn(self): -2033 """ -2034 The function returns the connection object. -2035 :return: The method is returning the value of the instance variable `self.conn`. 
-2036 """ -2037 -2038 return self.conn -2039 -2040 def is_genotype_column( -2041 self, -2042 column: str, -2043 database: str = None, -2044 downsampling: int = 1000, -2045 check_format: bool = True, -2046 ) -> bool: -2047 """ -2048 The `is_genotype_column` function in Python checks if a specified column in a database contains -2049 genotype data based on a regular expression pattern. -2050 -2051 :param column: The `column` parameter is a string that represents the name of a column in a -2052 database table. It is used to specify the column for which you want to check if it contains -2053 genotype information based on a regular expression pattern -2054 :type column: str -2055 :param database: The `database` parameter in the `is_genotype_column` method is used to specify -2056 the name of the database from which the data will be queried. If a database is provided, the -2057 method will query the specified database to check if the given column contains genotype -2058 information. If no database is provided, -2059 :type database: str -2060 :param downsampling: The `downsampling` parameter in the `is_genotype_column` method is an -2061 integer value that determines the number of rows to be sampled from the database table when -2062 checking for genotype information in the specified column. This parameter is used to limit the -2063 number of rows to be processed in order to improve performance, defaults to 1000 -2064 :type downsampling: int (optional) -2065 :param check_format: The `check_format` parameter in the `is_genotype_column` method is a -2066 boolean flag that determines whether the function should check the format of the data before -2067 proceeding with the genotype column analysis. If `check_format` is set to `True`, the function -2068 will verify if the specified column exists in, defaults to True -2069 :type check_format: bool (optional) -2070 :return: The `is_genotype_column` method returns a boolean value. If the specified column in a -2071 database table contains genotype information, it returns `True`; otherwise, it returns `False`. 
-2072 """ -2073 -2074 # Table variants -2075 table_variants_from = self.get_sql_database_link(database=database) -2076 -2077 # Check if format column is present -2078 if check_format: -2079 query = f""" -2080 SELECT * FROM {table_variants_from} -2081 LIMIT 0 -2082 """ -2083 df = self.query(query=query) -2084 if "FORMAT" not in df.columns or column not in df.columns: -2085 return False -2086 query_format = f""" -2087 AND (len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{column}" AS VARCHAR), ':')) OR regexp_matches(CAST("{column}" AS VARCHAR), '^[.]([/|][.])*$')) -2088 """ -2089 else: -2090 query_format = "" -2091 -2092 # Query number of samples -2093 query_downsampling = f""" -2094 SELECT "{column}", FORMAT -2095 FROM {table_variants_from} -2096 LIMIT {downsampling} -2097 """ -2098 df_downsampling = self.query(query=query_downsampling) -2099 -2100 # Query to check genotype -2101 query_genotype = f""" -2102 SELECT * -2103 FROM df_downsampling -2104 WHERE ( -2105 regexp_matches(CAST("{column}" AS VARCHAR), '^[0-9.]([/|][0-9.])*') -2106 {query_format} -2107 ) -2108 """ -2109 df_genotype = self.query(query=query_genotype) -2110 -2111 # return -2112 return len(df_genotype) == len(df_downsampling) -2113 -2114 def export( -2115 self, -2116 output_database: str, -2117 output_header: str = None, -2118 header_in_output: bool = True, -2119 database: str = None, -2120 table: str = "variants", -2121 parquet_partitions: list = None, -2122 threads: int = 1, -2123 sort: bool = False, -2124 index: bool = False, -2125 existing_columns_header: list = [], -2126 order_by: str = "", -2127 query: str = None, -2128 compression_type: str = None, -2129 chunk_size: int = 1000000, -2130 export_mode: str = "pyarrow", -2131 compresslevel: int = 6, -2132 export_header: bool = True, -2133 sample_list: list = None, -2134 ) -> bool: -2135 """ -2136 The `export` function exports data from a database to a specified output format, compresses it -2137 if necessary, and returns a boolean value indicating whether the export was successful or not. -2138 -2139 :param output_database: The `output_database` parameter is a string that represents the path and -2140 filename of the output file to be exported. It specifies where the exported data will be saved -2141 :type output_database: str -2142 :param output_header: The `output_header` parameter is an optional string that represents the -2143 header of the output file. If provided, it specifies the header that will be included in the -2144 output file. If not provided, the header will be automatically detected based on the output file -2145 format -2146 :type output_header: str -2147 :param header_in_output: The `header_in_output` parameter is a boolean value that determines -2148 whether the header should be included in the output file. If set to `True`, the header will be -2149 included in the output file. If set to `False`, the header will not be included in the output -2150 file. By default,, defaults to True -2151 :type header_in_output: bool (optional) -2152 :param database: The `database` parameter is the name of the database from which you want to -2153 export data. If this parameter is not provided, the function will use the `get_database()` -2154 method to retrieve the current database -2155 :type database: str -2156 :param table: The `table` parameter specifies the name of the table in the database from which -2157 the data will be exported. 
By default, if not specified, it is set to "variants", defaults to -2158 variants -2159 :type table: str (optional) -2160 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the -2161 partition columns for the Parquet output format. Each element in the list represents a partition -2162 column. The partitions are used to organize the data in the Parquet file based on the values of -2163 the specified columns -2164 :type parquet_partitions: list -2165 :param threads: The `threads` parameter in the `export` function is an optional integer that -2166 specifies the number of threads to use for exporting the data. It determines the level of -2167 parallelism during the export process. By default, it is set to 1, defaults to 1 -2168 :type threads: int (optional) -2169 :param sort: The `sort` parameter in the `export` function is a boolean value that specifies -2170 whether the output file should be sorted based on the genomic coordinates of the variants. If -2171 `sort` is set to `True`, the output file will be sorted. If `sort` is set to `False`,, defaults -2172 to False -2173 :type sort: bool (optional) -2174 :param index: The `index` parameter is a boolean value that specifies whether to index the -2175 output file. If `index` is set to `True`, the output file will be indexed. If `index` is set to -2176 `False` or not provided, the output file will not be indexed. By default,, defaults to False -2177 :type index: bool (optional) -2178 :param existing_columns_header: The `existing_columns_header` parameter is a list that -2179 represents the existing columns in the header of the output file. It is used to determine the -2180 columns that should be included in the output file. If this parameter is not provided, the -2181 function will automatically detect the header columns based on the output file format -2182 :type existing_columns_header: list -2183 :param order_by: The `order_by` parameter in the `export` function is a string that specifies -2184 the columns by which the output file should be ordered. You can specify multiple columns -2185 separated by commas. Each column can be followed by the keyword "ASC" (ascending) or "DESC" -2186 (descending) to specify -2187 :type order_by: str -2188 :param query: The `query` parameter in the `export` function represents a SQL query that -2189 specifies the data to be exported from the database. If provided, the function will export the -2190 result of this query. If the `query` parameter is not provided, the function will generate a -2191 query to export the data from -2192 :type query: str -2193 :param compression_type: The `compression_type` parameter in the `export` function specifies the -2194 type of compression to be applied to the output file. By default, the compression type is set to -2195 "bgzip". This parameter allows you to choose the compression algorithm for the output file, such -2196 as "gzip", "bgzip -2197 :type compression_type: str -2198 :param chunk_size: The `chunk_size` parameter in the `export` function specifies the size of -2199 each chunk or batch of data that will be processed during the export operation. 
It determines -2200 how many records or lines of data will be included in each chunk that is processed at a time, -2201 defaults to 1000000 -2202 :type chunk_size: int (optional) -2203 :param export_mode: The `export_mode` parameter in the `export` function specifies the mode of -2204 export, which can be either "pyarrow" or "duckdb", defaults to pyarrow -2205 :type export_mode: str (optional) -2206 :param compresslevel: The `compresslevel` parameter in the `export` function represents the -2207 level of compression for gzip. By default, it is set to 6. This parameter allows you to specify -2208 the compression level when using gzip compression for the output file. The compression level can -2209 range from 0 (no compression), defaults to 6 -2210 :type compresslevel: int (optional) -2211 :param export_header: The `export_header` parameter is a boolean flag that determines whether -2212 the header of a VCF file should be exported to a separate file or not. If `export_header` is -2213 True, the header will be exported to a file. If `export_header` is False, the header will not -2214 be, defaults to True -2215 :type export_header: bool (optional) -2216 :param sample_list: The `sample_list` parameter in the `export` function is a list that -2217 specifies the samples to be included in the exported data. If provided, the samples listed in -2218 this parameter will be included in the output file. If not provided, the function will determine -2219 the samples to include based on the data -2220 :type sample_list: list -2221 :return: The `export` function returns a boolean value indicating whether the export was -2222 successful or not. -2223 """ -2224 -2225 # Full path -2226 output_database = full_path(output_database) -2227 output_header = full_path(output_header) -2228 database = full_path(database) -2229 -2230 # Database -2231 if not database: -2232 database = self.get_database() -2233 if not database: -2234 return False -2235 -2236 # Chunk size -2237 if not chunk_size: -2238 chunk_size = 1000000 -2239 else: -2240 chunk_size = int(chunk_size) -2241 -2242 # Export mode -2243 # Either "pyarrow" (default) or "duckdb" -2244 if not export_mode: -2245 export_mode = "pyarrow" -2246 -2247 # Compression level -2248 if not compresslevel: -2249 compresslevel = 6 -2250 -2251 # Remove output if exists -2252 remove_if_exists(output_database) -2253 -2254 # Tmp -2255 tmp_folder = os.path.dirname(output_database) -2256 if not tmp_folder: -2257 tmp_folder = "." -2258 -2259 with TemporaryDirectory( -2260 dir=tmp_folder, prefix="howard_database_export_" -2261 ) as tmp_dir: -2262 -2263 # tmp files -2264 tmp_files = [] -2265 -2266 # query_set -2267 query_set = "" +1860 def get_table_columns_from_file( +1861 self, +1862 database: str = None, +1863 header_file: str = None, +1864 header_file_find: bool = True, +1865 ) -> list: +1866 """ +1867 The function `get_table_columns_from_file` retrieves the column names from a database or header +1868 file. +1869 +1870 :param database: The `database` parameter is a string that represents the name or path of the +1871 database file. If this parameter is not provided, the `get_database()` method is called to +1872 retrieve the database name or path +1873 :type database: str +1874 :param header_file: The `header_file` parameter is a string that represents the file path or +1875 name of the header file. 
This file contains the header information for a table, which typically +1876 includes the names of the columns in the table +1877 :type header_file: str +1878 :param header_file_find: Allow header file find if not provided +1879 :type header_file_find: bool +1880 :return: a list of table columns. +1881 """ +1882 +1883 table_columns = None +1884 +1885 if not database: +1886 database = self.get_database() +1887 +1888 if not header_file and header_file_find: +1889 header_file = self.get_header_file() +1890 +1891 if not header_file and header_file_find: +1892 header_file = self.find_header_file(database) +1893 +1894 database_format = self.get_format(database=database) +1895 delimiter = SEP_TYPE.get(database_format, "\t") +1896 +1897 # Try from database file +1898 try: +1899 table_header = self.read_header_file(database) +1900 except ValueError: +1901 table_header = None +1902 +1903 if table_header: +1904 try: +1905 table_columns = ( +1906 table_header[self.get_header_length(header_file=table_header)] +1907 .strip() +1908 .split(delimiter) +1909 ) +1910 except IndexError: +1911 table_columns = None +1912 else: +1913 table_columns = None +1914 +1915 if not table_columns: +1916 # Try from header file +1917 try: +1918 table_header = self.read_header_file(header_file) +1919 except ValueError: +1920 table_header = None +1921 +1922 if table_header: +1923 try: +1924 table_columns = ( +1925 table_header[self.get_header_length(header_file=table_header)] +1926 .strip() +1927 .split(delimiter) +1928 ) +1929 except IndexError: +1930 table_columns = None +1931 else: +1932 table_columns = None +1933 +1934 return table_columns +1935 +1936 def get_annotations(self, database: str = None) -> object: +1937 """ +1938 This function returns the annotations of a database or the default database if none is +1939 specified. +1940 +1941 :param database: The parameter `database` is a string that represents the name of the database +1942 to retrieve annotations from. If no database name is provided, the method will use the default +1943 database name obtained from the `get_database()` method +1944 :type database: str +1945 :return: The function `get_annotations` returns the `infos` attribute of the header of a +1946 database. If the `database` parameter is not provided, it gets the current database using the +1947 `get_database` method. If there is no header, it returns `None`. +1948 """ +1949 +1950 if not database: +1951 database = self.get_database() +1952 +1953 if self.get_header(database=database): +1954 return self.get_header(database=database).infos +1955 else: +1956 return None +1957 +1958 def get_extra_columns( +1959 self, database: str = None, database_type: str = None, sql_query: str = None +1960 ) -> list: +1961 """ +1962 This Python function returns a list of extra columns in a database table that are not needed +1963 based on the database type and existing columns. +1964 +1965 :param database: A string representing the name of the database to retrieve columns from. If +1966 None is provided, the default database will be used +1967 :type database: str +1968 :param database_type: The `database_type` parameter in the `get_extra_columns` function +1969 represents the type of the database for which you want to retrieve the list of extra columns. 
It +1970 is used to determine which columns are needed based on the database type and the existing +1971 columns in the specified database table +1972 :type database_type: str +1973 :param sql_query: The `sql_query` parameter in the `get_extra_columns` function is used to pass +1974 an SQL query that can be used to retrieve specific columns from the database. This query can be +1975 customized to filter columns based on certain conditions or criteria before analyzing them to +1976 determine the extra columns that are not needed +1977 :type sql_query: str +1978 :return: A list of extra columns in a database table that are not needed based on the database +1979 type and existing columns. +1980 """ +1981 +1982 if not database: +1983 database = self.get_database() +1984 +1985 if not database: +1986 return [] +1987 +1988 existing_columns = self.get_columns( +1989 database=database, +1990 table=self.get_database_table(database), +1991 sql_query=sql_query, +1992 ) +1993 if not database_type: +1994 database_type = self.get_type(database=database, sql_query=sql_query) +1995 needed_columns = self.get_needed_columns( +1996 database_columns=existing_columns, database_type=database_type +1997 ) +1998 +1999 extra_columns = existing_columns.copy() +2000 +2001 for needed_col in needed_columns: +2002 if needed_columns.get(needed_col) in extra_columns: +2003 extra_columns.remove(needed_columns.get(needed_col)) +2004 +2005 return extra_columns +2006 +2007 def is_vcf(self, database: str = None, sql_query: str = None) -> bool: +2008 """ +2009 The `is_vcf` function checks if a given database is of type "vcf" by examining its columns and +2010 their types. +2011 +2012 :param database: The `database` parameter in the `is_vcf` function is a string that represents +2013 the name of the database that the function will use to check if the file is a VCF (Variant Call +2014 Format) file. If the `database` parameter is not provided when calling the function, it will +2015 :type database: str +2016 :param sql_query: The `sql_query` parameter in the `is_vcf` function is used to pass an SQL +2017 query string that can be used to filter the columns retrieved from the database. This query can +2018 be used to narrow down the columns that are considered when checking if the database is of type +2019 "vcf" +2020 :type sql_query: str +2021 :return: The function `is_vcf` returns a boolean value indicating whether the database type is +2022 "vcf" or not. +2023 """ +2024 +2025 if not database: +2026 database = self.get_database() +2027 +2028 if not database: +2029 return False +2030 +2031 database_columns = self.get_columns( +2032 database=database, +2033 table=self.get_database_table(database), +2034 sql_query=sql_query, +2035 ) +2036 +2037 # Assume VCF is 8 needed columns, either only or with extra FORMAT column (assume other are samples) +2038 return self.get_type_from_columns( +2039 database_columns=database_columns, check_database_type="vcf" +2040 ) == "vcf" and ("FORMAT" in database_columns or len(database_columns) == 8) +2041 +2042 def get_conn(self): +2043 """ +2044 The function returns the connection object. +2045 :return: The method is returning the value of the instance variable `self.conn`. 
+2046 """ +2047 +2048 return self.conn +2049 +2050 def is_genotype_column( +2051 self, +2052 column: str, +2053 database: str = None, +2054 downsampling: int = 1000, +2055 check_format: bool = True, +2056 ) -> bool: +2057 """ +2058 The `is_genotype_column` function in Python checks if a specified column in a database contains +2059 genotype data based on a regular expression pattern. +2060 +2061 :param column: The `column` parameter is a string that represents the name of a column in a +2062 database table. It is used to specify the column for which you want to check if it contains +2063 genotype information based on a regular expression pattern +2064 :type column: str +2065 :param database: The `database` parameter in the `is_genotype_column` method is used to specify +2066 the name of the database from which the data will be queried. If a database is provided, the +2067 method will query the specified database to check if the given column contains genotype +2068 information. If no database is provided, +2069 :type database: str +2070 :param downsampling: The `downsampling` parameter in the `is_genotype_column` method is an +2071 integer value that determines the number of rows to be sampled from the database table when +2072 checking for genotype information in the specified column. This parameter is used to limit the +2073 number of rows to be processed in order to improve performance, defaults to 1000 +2074 :type downsampling: int (optional) +2075 :param check_format: The `check_format` parameter in the `is_genotype_column` method is a +2076 boolean flag that determines whether the function should check the format of the data before +2077 proceeding with the genotype column analysis. If `check_format` is set to `True`, the function +2078 will verify if the specified column exists in, defaults to True +2079 :type check_format: bool (optional) +2080 :return: The `is_genotype_column` method returns a boolean value. If the specified column in a +2081 database table contains genotype information, it returns `True`; otherwise, it returns `False`. 
+2082 """ +2083 +2084 # Table variants +2085 table_variants_from = self.get_sql_database_link(database=database) +2086 +2087 # Check if format column is present +2088 if check_format: +2089 query = f""" +2090 SELECT * FROM {table_variants_from} +2091 LIMIT 0 +2092 """ +2093 df = self.query(query=query) +2094 if "FORMAT" not in df.columns or column not in df.columns: +2095 return False +2096 query_format = f""" +2097 AND (len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{column}" AS VARCHAR), ':')) OR regexp_matches(CAST("{column}" AS VARCHAR), '^[.]([/|][.])*$')) +2098 """ +2099 else: +2100 query_format = "" +2101 +2102 # Query number of samples +2103 query_downsampling = f""" +2104 SELECT "{column}", FORMAT +2105 FROM {table_variants_from} +2106 LIMIT {downsampling} +2107 """ +2108 df_downsampling = self.query(query=query_downsampling) +2109 +2110 # Query to check genotype +2111 query_genotype = f""" +2112 SELECT * +2113 FROM df_downsampling +2114 WHERE ( +2115 regexp_matches(CAST("{column}" AS VARCHAR), '^[0-9.]([/|][0-9.])*') +2116 {query_format} +2117 ) +2118 """ +2119 df_genotype = self.query(query=query_genotype) +2120 +2121 # return +2122 return len(df_genotype) == len(df_downsampling) +2123 +2124 def export( +2125 self, +2126 output_database: str, +2127 output_header: str = None, +2128 header_in_output: bool = True, +2129 database: str = None, +2130 table: str = "variants", +2131 parquet_partitions: list = None, +2132 threads: int = 1, +2133 sort: bool = False, +2134 index: bool = False, +2135 existing_columns_header: list = [], +2136 order_by: str = "", +2137 query: str = None, +2138 compression_type: str = None, +2139 chunk_size: int = 1000000, +2140 export_mode: str = "pyarrow", +2141 compresslevel: int = 6, +2142 export_header: bool = True, +2143 sample_list: list = None, +2144 ) -> bool: +2145 """ +2146 The `export` function exports data from a database to a specified output format, compresses it +2147 if necessary, and returns a boolean value indicating whether the export was successful or not. +2148 +2149 :param output_database: The `output_database` parameter is a string that represents the path and +2150 filename of the output file to be exported. It specifies where the exported data will be saved +2151 :type output_database: str +2152 :param output_header: The `output_header` parameter is an optional string that represents the +2153 header of the output file. If provided, it specifies the header that will be included in the +2154 output file. If not provided, the header will be automatically detected based on the output file +2155 format +2156 :type output_header: str +2157 :param header_in_output: The `header_in_output` parameter is a boolean value that determines +2158 whether the header should be included in the output file. If set to `True`, the header will be +2159 included in the output file. If set to `False`, the header will not be included in the output +2160 file. By default,, defaults to True +2161 :type header_in_output: bool (optional) +2162 :param database: The `database` parameter is the name of the database from which you want to +2163 export data. If this parameter is not provided, the function will use the `get_database()` +2164 method to retrieve the current database +2165 :type database: str +2166 :param table: The `table` parameter specifies the name of the table in the database from which +2167 the data will be exported. 
By default, if not specified, it is set to "variants", defaults to +2168 variants +2169 :type table: str (optional) +2170 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the +2171 partition columns for the Parquet output format. Each element in the list represents a partition +2172 column. The partitions are used to organize the data in the Parquet file based on the values of +2173 the specified columns +2174 :type parquet_partitions: list +2175 :param threads: The `threads` parameter in the `export` function is an optional integer that +2176 specifies the number of threads to use for exporting the data. It determines the level of +2177 parallelism during the export process. By default, it is set to 1, defaults to 1 +2178 :type threads: int (optional) +2179 :param sort: The `sort` parameter in the `export` function is a boolean value that specifies +2180 whether the output file should be sorted based on the genomic coordinates of the variants. If +2181 `sort` is set to `True`, the output file will be sorted. If `sort` is set to `False`,, defaults +2182 to False +2183 :type sort: bool (optional) +2184 :param index: The `index` parameter is a boolean value that specifies whether to index the +2185 output file. If `index` is set to `True`, the output file will be indexed. If `index` is set to +2186 `False` or not provided, the output file will not be indexed. By default,, defaults to False +2187 :type index: bool (optional) +2188 :param existing_columns_header: The `existing_columns_header` parameter is a list that +2189 represents the existing columns in the header of the output file. It is used to determine the +2190 columns that should be included in the output file. If this parameter is not provided, the +2191 function will automatically detect the header columns based on the output file format +2192 :type existing_columns_header: list +2193 :param order_by: The `order_by` parameter in the `export` function is a string that specifies +2194 the columns by which the output file should be ordered. You can specify multiple columns +2195 separated by commas. Each column can be followed by the keyword "ASC" (ascending) or "DESC" +2196 (descending) to specify +2197 :type order_by: str +2198 :param query: The `query` parameter in the `export` function represents a SQL query that +2199 specifies the data to be exported from the database. If provided, the function will export the +2200 result of this query. If the `query` parameter is not provided, the function will generate a +2201 query to export the data from +2202 :type query: str +2203 :param compression_type: The `compression_type` parameter in the `export` function specifies the +2204 type of compression to be applied to the output file. By default, the compression type is set to +2205 "bgzip". This parameter allows you to choose the compression algorithm for the output file, such +2206 as "gzip", "bgzip +2207 :type compression_type: str +2208 :param chunk_size: The `chunk_size` parameter in the `export` function specifies the size of +2209 each chunk or batch of data that will be processed during the export operation. 
It determines +2210 how many records or lines of data will be included in each chunk that is processed at a time, +2211 defaults to 1000000 +2212 :type chunk_size: int (optional) +2213 :param export_mode: The `export_mode` parameter in the `export` function specifies the mode of +2214 export, which can be either "pyarrow" or "duckdb", defaults to pyarrow +2215 :type export_mode: str (optional) +2216 :param compresslevel: The `compresslevel` parameter in the `export` function represents the +2217 level of compression for gzip. By default, it is set to 6. This parameter allows you to specify +2218 the compression level when using gzip compression for the output file. The compression level can +2219 range from 0 (no compression), defaults to 6 +2220 :type compresslevel: int (optional) +2221 :param export_header: The `export_header` parameter is a boolean flag that determines whether +2222 the header of a VCF file should be exported to a separate file or not. If `export_header` is +2223 True, the header will be exported to a file. If `export_header` is False, the header will not +2224 be, defaults to True +2225 :type export_header: bool (optional) +2226 :param sample_list: The `sample_list` parameter in the `export` function is a list that +2227 specifies the samples to be included in the exported data. If provided, the samples listed in +2228 this parameter will be included in the output file. If not provided, the function will determine +2229 the samples to include based on the data +2230 :type sample_list: list +2231 :return: The `export` function returns a boolean value indicating whether the export was +2232 successful or not. +2233 """ +2234 +2235 # Full path +2236 output_database = full_path(output_database) +2237 output_header = full_path(output_header) +2238 database = full_path(database) +2239 +2240 # Database +2241 if not database: +2242 database = self.get_database() +2243 if not database: +2244 return False +2245 +2246 # Chunk size +2247 if not chunk_size: +2248 chunk_size = 1000000 +2249 else: +2250 chunk_size = int(chunk_size) +2251 +2252 # Export mode +2253 # Either "pyarrow" (default) or "duckdb" +2254 if not export_mode: +2255 export_mode = "pyarrow" +2256 +2257 # Compression level +2258 if not compresslevel: +2259 compresslevel = 6 +2260 +2261 # Remove output if exists +2262 remove_if_exists(output_database) +2263 +2264 # Tmp +2265 tmp_folder = os.path.dirname(output_database) +2266 if not tmp_folder: +2267 tmp_folder = "." 
2268 -2269 # Header columns -2270 if not existing_columns_header and output_header: -2271 existing_columns_header = self.get_header_file_columns(output_header) +2269 with TemporaryDirectory( +2270 dir=tmp_folder, prefix="howard_database_export_" +2271 ) as tmp_dir: 2272 -2273 # Auto-detect output type and compression and delimiter -2274 output_type = get_file_format(output_database) -2275 compressed = self.is_compressed(database=output_database) -2276 delimiter = FILE_FORMAT_DELIMITERS.get(output_type, "\t") -2277 -2278 # database type -2279 if output_type in ["vcf"]: -2280 database_type = "vcf" -2281 elif output_type in ["bed"]: -2282 database_type = "regions" -2283 else: -2284 database_type = self.get_type(database=database, sql_query=query) -2285 -2286 # database object -2287 # If database is string, then create database conn -2288 if isinstance(database, str): -2289 database_conn = Database(database=database).get_conn() -2290 else: -2291 database_conn = database -2292 -2293 # Existing columns -2294 existing_columns = self.get_columns( -2295 database=database_conn, -2296 table=self.get_database_table(database=database), -2297 sql_query=query, -2298 ) -2299 -2300 # Extra columns -2301 extra_columns = self.get_extra_columns( -2302 database=database_conn, database_type=output_type, sql_query=query -2303 ) -2304 -2305 # Needed columns -2306 needed_columns = self.get_needed_columns( -2307 database_columns=existing_columns, database_type=database_type +2273 # tmp files +2274 tmp_files = [] +2275 +2276 # query_set +2277 query_set = "" +2278 +2279 # Header columns +2280 if not existing_columns_header and output_header: +2281 existing_columns_header = self.get_header_file_columns(output_header) +2282 +2283 # Auto-detect output type and compression and delimiter +2284 output_type = get_file_format(output_database) +2285 compressed = self.is_compressed(database=output_database) +2286 delimiter = FILE_FORMAT_DELIMITERS.get(output_type, "\t") +2287 +2288 # database type +2289 if output_type in ["vcf"]: +2290 database_type = "vcf" +2291 elif output_type in ["bed"]: +2292 database_type = "regions" +2293 else: +2294 database_type = self.get_type(database=database, sql_query=query) +2295 +2296 # database object +2297 # If database is string, then create database conn +2298 if isinstance(database, str): +2299 database_conn = Database(database=database).get_conn() +2300 else: +2301 database_conn = database +2302 +2303 # Existing columns +2304 existing_columns = self.get_columns( +2305 database=database_conn, +2306 table=self.get_database_table(database=database), +2307 sql_query=query, 2308 ) 2309 -2310 # Order by -2311 order_by_list = [] -2312 if order_by: -2313 # Split order by options -2314 order_by_split = order_by.split(",") -2315 for order_by_option in order_by_split: -2316 # Split order by option -2317 order_by_option_split = order_by_option.strip().split(" ") -2318 order_by_option_split_column = order_by_option_split[0] -2319 if len(order_by_option_split) > 1: -2320 order_by_option_split_order = order_by_option_split[1] -2321 else: -2322 order_by_option_split_order = "ASC" -2323 # Chek if column exists -2324 if ( -2325 order_by_option_split_column.replace('"', "").strip() -2326 in existing_columns -2327 ): -2328 order_by_list.append( -2329 f"{order_by_option_split_column} {order_by_option_split_order}" -2330 ) -2331 -2332 # Clean order by -2333 order_by_clean = ", ".join(order_by_list) -2334 -2335 # Query values -2336 default_empty_value = "" -2337 query_export_format = None -2338 include_header = 
False -2339 post_process = False -2340 order_by_sql = "" -2341 post_process_just_move = False -2342 -2343 # export options -2344 export_options = {} -2345 -2346 # VCF -2347 if output_type in ["vcf"]: -2348 if not self.is_vcf(database=database_conn, sql_query=query): -2349 extra_columns = [] -2350 else: -2351 extra_columns = existing_columns_header +2310 # Extra columns +2311 extra_columns = self.get_extra_columns( +2312 database=database_conn, database_type=output_type, sql_query=query +2313 ) +2314 +2315 # Needed columns +2316 needed_columns = self.get_needed_columns( +2317 database_columns=existing_columns, database_type=database_type +2318 ) +2319 +2320 # Order by +2321 order_by_list = [] +2322 if order_by: +2323 # Split order by options +2324 order_by_split = order_by.split(",") +2325 for order_by_option in order_by_split: +2326 # Split order by option +2327 order_by_option_split = order_by_option.strip().split(" ") +2328 order_by_option_split_column = order_by_option_split[0] +2329 if len(order_by_option_split) > 1: +2330 order_by_option_split_order = order_by_option_split[1] +2331 else: +2332 order_by_option_split_order = "ASC" +2333 # Chek if column exists +2334 if ( +2335 order_by_option_split_column.replace('"', "").strip() +2336 in existing_columns +2337 ): +2338 order_by_list.append( +2339 f"{order_by_option_split_column} {order_by_option_split_order}" +2340 ) +2341 +2342 # Clean order by +2343 order_by_clean = ", ".join(order_by_list) +2344 +2345 # Query values +2346 default_empty_value = "" +2347 query_export_format = None +2348 include_header = False +2349 post_process = False +2350 order_by_sql = "" +2351 post_process_just_move = False 2352 -2353 # Check VCF format with extra columns -2354 extra_columns_clean = [] +2353 # export options +2354 export_options = {} 2355 -2356 # Force samples list in parameter -2357 if sample_list: -2358 if "FORMAT" in extra_columns: -2359 extra_columns = ["FORMAT"] + sample_list -2360 else: -2361 extra_columns = sample_list +2356 # VCF +2357 if output_type in ["vcf"]: +2358 if not self.is_vcf(database=database_conn, sql_query=query): +2359 extra_columns = [] +2360 else: +2361 extra_columns = existing_columns_header 2362 -2363 # Check columns -2364 else: -2365 for extra_column in extra_columns: -2366 if extra_column not in needed_columns and ( -2367 extra_column == "FORMAT" -2368 or ( -2369 "FORMAT" in extra_columns_clean -2370 and self.is_genotype_column( -2371 database=database, column=extra_column -2372 ) -2373 ) -2374 ): -2375 extra_columns_clean.append(extra_column) -2376 extra_columns = extra_columns_clean -2377 -2378 default_empty_value = "." 
-2379 query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER, QUOTE '', COMPRESSION 'gzip'" -2380 include_header = True -2381 post_process = True -2382 # Export options -2383 if not compression_type: -2384 compression_type = "bgzip" -2385 export_options = { -2386 "format": "CSV", -2387 "delimiter": delimiter, -2388 "header": True, -2389 "quote": None, -2390 "compression": compression_type, -2391 } -2392 compresslevel = 1 -2393 -2394 # TSV/CSV/TBL -2395 elif output_type in ["tsv", "csv", "tbl"]: -2396 if output_type in ["csv", "tbl"]: -2397 quote = '"' -2398 else: -2399 quote = "" -2400 query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER, QUOTE '{quote}', COMPRESSION 'gzip'" -2401 if delimiter in ["\t"]: -2402 include_header = header_in_output and True -2403 post_process = True -2404 if order_by_clean: -2405 order_by_sql = f"ORDER BY {order_by_clean}" -2406 # Export options -2407 if not compression_type: -2408 compression_type = "gzip" -2409 export_options = { -2410 "format": "CSV", -2411 "delimiter": delimiter, -2412 "header": True, -2413 "quote": quote, -2414 "compression": compression_type, -2415 } -2416 -2417 # JSON -2418 elif output_type in ["json"]: -2419 query_export_format = "FORMAT JSON, ARRAY TRUE" -2420 include_header = False -2421 post_process = True -2422 if order_by_clean: -2423 order_by_sql = f"ORDER BY {order_by_clean}" -2424 # Export options -2425 if not compression_type: -2426 compression_type = "gzip" -2427 export_options = { -2428 "format": "JSON", -2429 "array": True, -2430 "compression": compression_type, -2431 } -2432 -2433 # Parquet -2434 elif output_type in ["parquet"]: -2435 query_export_format = "FORMAT PARQUET" -2436 # Export options +2363 # Check VCF format with extra columns +2364 extra_columns_clean = [] +2365 +2366 # Force samples list in parameter +2367 if sample_list: +2368 if "FORMAT" in extra_columns: +2369 extra_columns = ["FORMAT"] + sample_list +2370 else: +2371 extra_columns = sample_list +2372 +2373 # Check columns +2374 else: +2375 for extra_column in extra_columns: +2376 if extra_column not in needed_columns and ( +2377 extra_column == "FORMAT" +2378 or ( +2379 "FORMAT" in extra_columns_clean +2380 and self.is_genotype_column( +2381 database=database, column=extra_column +2382 ) +2383 ) +2384 ): +2385 extra_columns_clean.append(extra_column) +2386 extra_columns = extra_columns_clean +2387 +2388 default_empty_value = "." 
+2389 query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER, QUOTE '', COMPRESSION 'gzip'" +2390 include_header = True +2391 post_process = True +2392 # Export options +2393 if not compression_type: +2394 compression_type = "bgzip" +2395 export_options = { +2396 "format": "CSV", +2397 "delimiter": delimiter, +2398 "header": True, +2399 "quote": None, +2400 "compression": compression_type, +2401 } +2402 compresslevel = 1 +2403 +2404 # TSV/CSV/TBL +2405 elif output_type in ["tsv", "csv", "tbl"]: +2406 if output_type in ["csv", "tbl"]: +2407 quote = '"' +2408 else: +2409 quote = "" +2410 query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER, QUOTE '{quote}', COMPRESSION 'gzip'" +2411 if delimiter in ["\t"]: +2412 include_header = header_in_output and True +2413 post_process = True +2414 if order_by_clean: +2415 order_by_sql = f"ORDER BY {order_by_clean}" +2416 # Export options +2417 if not compression_type: +2418 compression_type = "gzip" +2419 export_options = { +2420 "format": "CSV", +2421 "delimiter": delimiter, +2422 "header": True, +2423 "quote": quote, +2424 "compression": compression_type, +2425 } +2426 +2427 # JSON +2428 elif output_type in ["json"]: +2429 query_export_format = "FORMAT JSON, ARRAY TRUE" +2430 include_header = False +2431 post_process = True +2432 if order_by_clean: +2433 order_by_sql = f"ORDER BY {order_by_clean}" +2434 # Export options +2435 if not compression_type: +2436 compression_type = "gzip" 2437 export_options = { -2438 "format": "PARQUET", -2439 } -2440 include_header = False -2441 post_process = True +2438 "format": "JSON", +2439 "array": True, +2440 "compression": compression_type, +2441 } 2442 -2443 # BED -2444 elif output_type in ["bed"]: -2445 query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER" -2446 include_header = True -2447 post_process = True -2448 if order_by_clean: -2449 order_by_sql = f"ORDER BY {order_by_clean}" -2450 # Export options -2451 if not compression_type: -2452 compression_type = "gzip" -2453 export_options = { -2454 "format": "CSV", -2455 "delimiter": delimiter, -2456 "header": True, -2457 "quote": None, -2458 "compression": compression_type, -2459 } -2460 -2461 # duckDB -2462 elif output_type in ["duckdb"]: -2463 -2464 # Needed column -2465 needed_columns = [] -2466 -2467 # Export database as Parquet -2468 database_export_parquet_file = os.path.join(tmp_dir, "output.parquet") -2469 self.export( -2470 database=database, output_database=database_export_parquet_file -2471 ) -2472 -2473 # Create database and connexion -2474 output_database_conn = duckdb.connect(output_database) -2475 -2476 # Create table in database connexion with Parquet file -2477 query_copy = f""" -2478 CREATE TABLE {table} -2479 AS {self.get_sql_database_link(database=database_export_parquet_file)} -2480 """ -2481 output_database_conn.execute(query_copy) +2443 # Parquet +2444 elif output_type in ["parquet"]: +2445 query_export_format = "FORMAT PARQUET" +2446 # Export options +2447 export_options = { +2448 "format": "PARQUET", +2449 } +2450 include_header = False +2451 post_process = True +2452 +2453 # BED +2454 elif output_type in ["bed"]: +2455 query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER" +2456 include_header = True +2457 post_process = True +2458 if order_by_clean: +2459 order_by_sql = f"ORDER BY {order_by_clean}" +2460 # Export options +2461 if not compression_type: +2462 compression_type = "gzip" +2463 export_options = { +2464 "format": "CSV", +2465 "delimiter": delimiter, +2466 "header": True, 
+2467 "quote": None, +2468 "compression": compression_type, +2469 } +2470 +2471 # duckDB +2472 elif output_type in ["duckdb"]: +2473 +2474 # Needed column +2475 needed_columns = [] +2476 +2477 # Export database as Parquet +2478 database_export_parquet_file = os.path.join(tmp_dir, "output.parquet") +2479 self.export( +2480 database=database, output_database=database_export_parquet_file +2481 ) 2482 -2483 # Close connexion -2484 output_database_conn.close() +2483 # Create database and connexion +2484 output_database_conn = duckdb.connect(output_database) 2485 -2486 # remove tmp -2487 remove_if_exists([database_export_parquet_file]) -2488 -2489 return os.path.exists(output_database) -2490 -2491 # Partition -2492 if parquet_partitions: -2493 parquet_partitions_clean = [] -2494 parquet_partitions_array = [] -2495 for parquet_partition in parquet_partitions: -2496 parquet_partitions_array.append( -2497 parquet_partition.translate({'"': None, "'": None, " ": None}) -2498 ) -2499 parquet_partitions_clean.append( -2500 '"' -2501 + parquet_partition.translate({'"': None, "'": None, " ": None}) -2502 + '"' -2503 ) -2504 parquet_partitions_by = ",".join(parquet_partitions_clean) -2505 query_export_format += ( -2506 f", PARTITION_BY ({parquet_partitions_by}), OVERWRITE_OR_IGNORE" -2507 ) -2508 export_options["partition_by"] = parquet_partitions_array -2509 if export_options.get("format", None) == "CSV": -2510 export_mode = "duckdb" -2511 post_process_just_move = True -2512 -2513 # Construct query columns -2514 query_columns = [] -2515 -2516 # Add Needed columns -2517 for needed_column in needed_columns: -2518 if needed_columns[needed_column]: -2519 query_column_name = needed_columns[needed_column] -2520 query_column = f""" "{needed_columns[needed_column]}" """ -2521 else: -2522 query_column_name = default_empty_value -2523 query_column = f""" '{default_empty_value}' """ -2524 query_column_as = f""" "{needed_column}" """ -2525 if query_column_name == needed_column: -2526 query_columns.append(f""" {query_column} """) -2527 else: -2528 query_columns.append(f""" {query_column} AS {query_column_as} """) -2529 -2530 # Add Extra columns -2531 for extra_column in extra_columns: -2532 if extra_column not in needed_columns: -2533 # query_columns.append(f""" "{extra_column}" AS "{extra_column}" """) -2534 query_columns.append(f""" "{extra_column}" """) -2535 -2536 # Query export columns -2537 query_export_columns = f""" {",".join(query_columns)} """ -2538 -2539 if query_columns: -2540 -2541 # Compressed tmp file -2542 query_output_database_tmp = os.path.join(tmp_dir, "output") -2543 -2544 # Query -2545 # If no query, generate query of the database -2546 if not query: -2547 query = f""" -2548 SELECT {query_export_columns} -2549 FROM {self.get_sql_database_link(database=database)} -2550 {order_by_sql} -2551 """ -2552 -2553 # Test empty query -2554 df = self.conn.execute(query).fetch_record_batch(1) -2555 query_empty = True -2556 for d in df: -2557 query_empty = False -2558 break -2559 if query_empty: -2560 log.error("Export failed: Empty") -2561 raise ValueError("Export failed: Empty") +2486 # Create table in database connexion with Parquet file +2487 query_copy = f""" +2488 CREATE TABLE {table} +2489 AS {self.get_sql_database_link(database=database_export_parquet_file)} +2490 """ +2491 output_database_conn.execute(query_copy) +2492 +2493 # Close connexion +2494 output_database_conn.close() +2495 +2496 # remove tmp +2497 remove_if_exists([database_export_parquet_file]) +2498 +2499 return 
os.path.exists(output_database) +2500 +2501 # Partition +2502 if parquet_partitions: +2503 parquet_partitions_clean = [] +2504 parquet_partitions_array = [] +2505 for parquet_partition in parquet_partitions: +2506 parquet_partitions_array.append( +2507 parquet_partition.translate({'"': None, "'": None, " ": None}) +2508 ) +2509 parquet_partitions_clean.append( +2510 '"' +2511 + parquet_partition.translate({'"': None, "'": None, " ": None}) +2512 + '"' +2513 ) +2514 parquet_partitions_by = ",".join(parquet_partitions_clean) +2515 query_export_format += ( +2516 f", PARTITION_BY ({parquet_partitions_by}), OVERWRITE_OR_IGNORE" +2517 ) +2518 export_options["partition_by"] = parquet_partitions_array +2519 if export_options.get("format", None) == "CSV": +2520 export_mode = "duckdb" +2521 post_process_just_move = True +2522 +2523 # Construct query columns +2524 query_columns = [] +2525 +2526 # Add Needed columns +2527 for needed_column in needed_columns: +2528 if needed_columns[needed_column]: +2529 query_column_name = needed_columns[needed_column] +2530 query_column = f""" "{needed_columns[needed_column]}" """ +2531 else: +2532 query_column_name = default_empty_value +2533 query_column = f""" '{default_empty_value}' """ +2534 query_column_as = f""" "{needed_column}" """ +2535 if query_column_name == needed_column: +2536 query_columns.append(f""" {query_column} """) +2537 else: +2538 query_columns.append(f""" {query_column} AS {query_column_as} """) +2539 +2540 # Add Extra columns +2541 for extra_column in extra_columns: +2542 if extra_column not in needed_columns: +2543 # query_columns.append(f""" "{extra_column}" AS "{extra_column}" """) +2544 query_columns.append(f""" "{extra_column}" """) +2545 +2546 # Query export columns +2547 query_export_columns = f""" {",".join(query_columns)} """ +2548 +2549 if query_columns: +2550 +2551 # Compressed tmp file +2552 query_output_database_tmp = os.path.join(tmp_dir, "output") +2553 +2554 # Query +2555 # If no query, generate query of the database +2556 if not query: +2557 query = f""" +2558 SELECT {query_export_columns} +2559 FROM {self.get_sql_database_link(database=database)} +2560 {order_by_sql} +2561 """ 2562 -2563 # Schema names -2564 schema_names = None -2565 -2566 # Export mode pyarrow -2567 if export_mode == "pyarrow": -2568 -2569 # Compression mode -2570 # If compress required and compression type as gzip or bgzip -2571 # For bgzip compression, recompression will be done, and compression with gzip for tmp file done (level 1) -2572 # to reduce tmp file size -2573 compression_mode_gzip = compressed and ( -2574 export_options.get("compression", None) in ["gzip", "bgzip"] -2575 ) -2576 -2577 # File stream mode (str or bytes) -2578 f_mode = "" -2579 if compression_mode_gzip: -2580 f_mode = "b" -2581 -2582 if include_header: -2583 -2584 # Open stream files (uncompressed and compressed) -2585 with open(query_output_database_tmp, mode="w") as f, pgzip.open( -2586 query_output_database_tmp, -2587 mode="w", -2588 thread=threads, -2589 compresslevel=compresslevel, -2590 ) as f_gz: -2591 -2592 # Switch to compressed stream file -2593 if compression_mode_gzip: -2594 f = f_gz +2563 # Test empty query +2564 df = self.conn.execute(query).fetch_record_batch(1) +2565 query_empty = True +2566 for d in df: +2567 query_empty = False +2568 break +2569 if query_empty: +2570 log.warning("Export warning: Empty") +2571 remove_header_line = False +2572 else: +2573 remove_header_line = True +2574 +2575 # Schema names +2576 schema_names = None +2577 +2578 # Export mode 
pyarrow +2579 if export_mode == "pyarrow": +2580 +2581 # Compression mode +2582 # If compress required and compression type as gzip or bgzip +2583 # For bgzip compression, recompression will be done, and compression with gzip for tmp file done (level 1) +2584 # to reduce tmp file size +2585 compression_mode_gzip = compressed and ( +2586 export_options.get("compression", None) in ["gzip", "bgzip"] +2587 ) +2588 +2589 # File stream mode (str or bytes) +2590 f_mode = "" +2591 if compression_mode_gzip: +2592 f_mode = "b" +2593 +2594 if include_header: 2595 -2596 # Generate header tmp file -2597 query_output_header_tmp = os.path.join(tmp_dir, "header") -2598 self.get_header_file( -2599 header_file=query_output_header_tmp, -2600 remove_header_line=True, -2601 sql_query=query, -2602 ) +2596 # Open stream files (uncompressed and compressed) +2597 with open(query_output_database_tmp, mode="w") as f, pgzip.open( +2598 query_output_database_tmp, +2599 mode="w", +2600 thread=threads, +2601 compresslevel=compresslevel, +2602 ) as f_gz: 2603 -2604 # Write header to tmp file -2605 with open( -2606 query_output_header_tmp, "r" + f_mode -2607 ) as output_header_tmp: -2608 f.write(output_header_tmp.read()) -2609 -2610 # JSON format - Add special "[" character at the beginning of the file -2611 if export_options.get("format") in ["JSON"]: -2612 -2613 # Open stream files (uncompressed and compressed) -2614 with open(query_output_database_tmp, mode="a") as f, pgzip.open( -2615 query_output_database_tmp, -2616 mode="a", -2617 thread=threads, -2618 compresslevel=compresslevel, -2619 ) as f_gz: -2620 -2621 # Switch to compressed stream file -2622 if compression_mode_gzip: -2623 f = f_gz -2624 f.write(b"[\n") -2625 else: -2626 f.write("[\n") -2627 -2628 # Open stream files (uncompressed and compressed) for chunk -2629 with open(query_output_database_tmp, mode="a") as f, pgzip.open( -2630 query_output_database_tmp, -2631 mode="a", -2632 thread=threads, -2633 compresslevel=compresslevel, -2634 ) as f_gz: -2635 -2636 # Switch to compressed stream file -2637 if compression_mode_gzip: -2638 f = f_gz +2604 # Switch to compressed stream file +2605 if compression_mode_gzip: +2606 f = f_gz +2607 +2608 # Generate header tmp file +2609 query_output_header_tmp = os.path.join(tmp_dir, "header") +2610 self.get_header_file( +2611 header_file=query_output_header_tmp, +2612 remove_header_line=remove_header_line, +2613 sql_query=query, +2614 ) +2615 +2616 # Write header to tmp file +2617 with open( +2618 query_output_header_tmp, "r" + f_mode +2619 ) as output_header_tmp: +2620 f.write(output_header_tmp.read()) +2621 +2622 # JSON format - Add special "[" character at the beginning of the file +2623 if export_options.get("format") in ["JSON"]: +2624 +2625 # Open stream files (uncompressed and compressed) +2626 with open(query_output_database_tmp, mode="a") as f, pgzip.open( +2627 query_output_database_tmp, +2628 mode="a", +2629 thread=threads, +2630 compresslevel=compresslevel, +2631 ) as f_gz: +2632 +2633 # Switch to compressed stream file +2634 if compression_mode_gzip: +2635 f = f_gz +2636 f.write(b"[\n") +2637 else: +2638 f.write("[\n") 2639 -2640 # Chunk query with batch of dataframes of chunk_size -2641 df = self.conn.execute(query).fetch_record_batch(chunk_size) -2642 -2643 # id of chunk -2644 i = 0 -2645 -2646 # For each chunk dataframe -2647 for d in df: -2648 -2649 # Schema names -2650 schema_names = d.schema.names +2640 # Open stream files (uncompressed and compressed) for chunk +2641 with open(query_output_database_tmp, 
mode="a") as f, pgzip.open( +2642 query_output_database_tmp, +2643 mode="a", +2644 thread=threads, +2645 compresslevel=compresslevel, +2646 ) as f_gz: +2647 +2648 # Switch to compressed stream file +2649 if compression_mode_gzip: +2650 f = f_gz 2651 -2652 # id of chunk -2653 i += 1 +2652 # Chunk query with batch of dataframes of chunk_size +2653 df = self.conn.execute(query).fetch_record_batch(chunk_size) 2654 -2655 # Log - number of records -2656 log.debug(f"Chunk {i}: records process...") +2655 # id of chunk +2656 i = 0 2657 -2658 # Check process for first chunk -2659 if i == 1: +2658 # For each chunk dataframe +2659 for d in df: 2660 -2661 # If include header in file -2662 header = export_options.get("header", True) +2661 # Schema names +2662 schema_names = d.schema.names 2663 -2664 # Parquet output format -2665 # Either a folder or a writer -2666 if export_options.get("format") in ["PARQUET"]: -2667 -2668 # For Parquet with multiple file - folder -2669 if export_options.get( -2670 "partition_by", None -2671 ) or export_options.get("per_thread_output", False): -2672 query_output_database_tmp = ( -2673 f"{query_output_database_tmp}.parquet" -2674 ) +2664 # id of chunk +2665 i += 1 +2666 +2667 # Log - number of records +2668 log.debug(f"Chunk {i}: records process...") +2669 +2670 # Check process for first chunk +2671 if i == 1: +2672 +2673 # If include header in file +2674 header = export_options.get("header", True) 2675 -2676 # For Parquet as a unique file - writer -2677 else: -2678 writer = pq.ParquetWriter( -2679 query_output_database_tmp, d.schema -2680 ) -2681 -2682 else: -2683 -2684 # Switch of header in file for not first chunk -2685 header = False -2686 -2687 # CSV format -2688 if export_options.get("format") in ["CSV"]: -2689 -2690 # With quote option -2691 if export_options.get("quote", None): -2692 -2693 # Polars write dataframe -2694 pl.from_arrow(d).write_csv( -2695 file=f, -2696 separator=export_options.get("delimiter", ""), -2697 include_header=header, -2698 quote_char=export_options.get("quote", '"'), -2699 ) -2700 -2701 # Without quote option -2702 else: -2703 -2704 # Polars write dataframe -2705 pl.from_arrow(d).write_csv( -2706 file=f, -2707 separator=export_options.get("delimiter", ""), -2708 include_header=header, -2709 quote_style="never", -2710 ) -2711 -2712 # JSON format -2713 elif export_options.get("format") in ["JSON"]: -2714 -2715 # Compressed mode gzip -2716 if compression_mode_gzip: -2717 -2718 # Add comma at the beginning of dataframe (if not the first one) in bytes mode -2719 if i > 1: -2720 f.write(b",\n") -2721 -2722 # Write dataframe in bytes mode -2723 f.write( -2724 str.encode( -2725 pl.from_arrow(d) -2726 .write_ndjson() -2727 .replace("\n{", ",\n{") -2728 .replace("[", "") -2729 .replace("]", "") -2730 ) -2731 ) -2732 -2733 # Not compressed mode gzip (string mode) -2734 else: -2735 -2736 # Add comma at the beginning of dataframe (if not the first one) in string mode -2737 if i > 1: -2738 f.write(",\n") -2739 -2740 # Write dataframe in string mode -2741 f.write( -2742 pl.from_arrow(d) -2743 .write_ndjson() -2744 .replace("\n{", ",\n{") -2745 .replace("[", "") -2746 .replace("]", "") -2747 ) -2748 -2749 # Parquet format -2750 elif export_options.get("format") in ["PARQUET"]: +2676 # Parquet output format +2677 # Either a folder or a writer +2678 if export_options.get("format") in ["PARQUET"]: +2679 +2680 # For Parquet with multiple file - folder +2681 if export_options.get( +2682 "partition_by", None +2683 ) or export_options.get("per_thread_output", 
False): +2684 query_output_database_tmp = ( +2685 f"{query_output_database_tmp}.parquet" +2686 ) +2687 +2688 # For Parquet as a unique file - writer +2689 else: +2690 writer = pq.ParquetWriter( +2691 query_output_database_tmp, d.schema +2692 ) +2693 +2694 else: +2695 +2696 # Switch of header in file for not first chunk +2697 header = False +2698 +2699 # CSV format +2700 if export_options.get("format") in ["CSV"]: +2701 +2702 # With quote option +2703 if export_options.get("quote", None): +2704 +2705 # Polars write dataframe +2706 pl.from_arrow(d).write_csv( +2707 file=f, +2708 separator=export_options.get("delimiter", ""), +2709 include_header=header, +2710 quote_char=export_options.get("quote", '"'), +2711 ) +2712 +2713 # Without quote option +2714 else: +2715 +2716 # Polars write dataframe +2717 pl.from_arrow(d).write_csv( +2718 file=f, +2719 separator=export_options.get("delimiter", ""), +2720 include_header=header, +2721 quote_style="never", +2722 ) +2723 +2724 # JSON format +2725 elif export_options.get("format") in ["JSON"]: +2726 +2727 # Compressed mode gzip +2728 if compression_mode_gzip: +2729 +2730 # Add comma at the beginning of dataframe (if not the first one) in bytes mode +2731 if i > 1: +2732 f.write(b",\n") +2733 +2734 # Write dataframe in bytes mode +2735 f.write( +2736 str.encode( +2737 pl.from_arrow(d) +2738 .write_ndjson() +2739 .replace("\n{", ",\n{") +2740 .replace("[", "") +2741 .replace("]", "") +2742 ) +2743 ) +2744 +2745 # Not compressed mode gzip (string mode) +2746 else: +2747 +2748 # Add comma at the beginning of dataframe (if not the first one) in string mode +2749 if i > 1: +2750 f.write(",\n") 2751 -2752 # Partition by fields -2753 partition_by = export_options.get("partition_by", None) -2754 -2755 if partition_by: -2756 -2757 # For No partition but split parquet files into a folder -2758 if "None" in partition_by: -2759 partition_by = None +2752 # Write dataframe in string mode +2753 f.write( +2754 pl.from_arrow(d) +2755 .write_ndjson() +2756 .replace("\n{", ",\n{") +2757 .replace("[", "") +2758 .replace("]", "") +2759 ) 2760 -2761 # Pyarrow write -2762 pq.write_to_dataset( -2763 pa.Table.from_batches([d]), -2764 query_output_database_tmp, -2765 partition_cols=partition_by, -2766 use_threads=threads, -2767 existing_data_behavior="overwrite_or_ignore", -2768 ) -2769 -2770 # Parquet in unique file -2771 else: -2772 writer.write_batch(d) -2773 -2774 # Close Parquet writer -2775 if ( -2776 export_options.get("format") in ["PARQUET"] -2777 and not export_options.get("partition_by", None) -2778 and not export_options.get("per_thread_output", None) -2779 ): -2780 writer.close() +2761 # Parquet format +2762 elif export_options.get("format") in ["PARQUET"]: +2763 +2764 # Partition by fields +2765 partition_by = export_options.get("partition_by", None) +2766 +2767 if partition_by: +2768 +2769 # For No partition but split parquet files into a folder +2770 if "None" in partition_by: +2771 partition_by = None +2772 +2773 # Pyarrow write +2774 pq.write_to_dataset( +2775 pa.Table.from_batches([d]), +2776 query_output_database_tmp, +2777 partition_cols=partition_by, +2778 use_threads=threads, +2779 existing_data_behavior="overwrite_or_ignore", +2780 ) 2781 -2782 # JSON format - Add special "]" character at the end of the file -2783 if export_options.get("format") in ["JSON"]: -2784 -2785 # Open stream files (uncompressed and compressed) -2786 with open(query_output_database_tmp, mode="a") as f, pgzip.open( -2787 query_output_database_tmp, -2788 mode="a", -2789 
thread=threads, -2790 compresslevel=compresslevel, -2791 ) as f_gz: -2792 -2793 # Switch to compressed stream file -2794 if compression_mode_gzip: -2795 f = f_gz -2796 f.write(b"]\n") -2797 else: -2798 f.write("]\n") -2799 -2800 # Export mode duckdb -2801 elif export_mode == "duckdb": -2802 -2803 # Create COPY TO query -2804 query_copy = f""" -2805 {query_set} -2806 COPY ( -2807 {query} -2808 ) -2809 TO '{query_output_database_tmp}' -2810 WITH ({query_export_format}) -2811 """ -2812 -2813 # Export with duckdb -2814 self.query(query=query_copy) -2815 -2816 # Export mode unknown -2817 else: -2818 log.error(f"Export mode '{export_mode}' unknown") -2819 raise ValueError(f"Export mode '{export_mode}' unknown") -2820 -2821 # Post process -2822 if post_process: -2823 -2824 # Log - number of records -2825 log.debug("Post processing...") -2826 -2827 # Input files -2828 input_files = [] -2829 -2830 # Export mode duckdb and include header -2831 if export_mode == "duckdb" and include_header: +2782 # Parquet in unique file +2783 else: +2784 writer.write_batch(d) +2785 +2786 # Close Parquet writer +2787 if ( +2788 export_options.get("format") in ["PARQUET"] +2789 and not export_options.get("partition_by", None) +2790 and not export_options.get("per_thread_output", None) +2791 ): +2792 writer.close() +2793 +2794 # JSON format - Add special "]" character at the end of the file +2795 if export_options.get("format") in ["JSON"]: +2796 +2797 # Open stream files (uncompressed and compressed) +2798 with open(query_output_database_tmp, mode="a") as f, pgzip.open( +2799 query_output_database_tmp, +2800 mode="a", +2801 thread=threads, +2802 compresslevel=compresslevel, +2803 ) as f_gz: +2804 +2805 # Switch to compressed stream file +2806 if compression_mode_gzip: +2807 f = f_gz +2808 f.write(b"]\n") +2809 else: +2810 f.write("]\n") +2811 +2812 # Export mode duckdb +2813 elif export_mode == "duckdb": +2814 +2815 # Create COPY TO query +2816 query_copy = f""" +2817 {query_set} +2818 COPY ( +2819 {query} +2820 ) +2821 TO '{query_output_database_tmp}' +2822 WITH ({query_export_format}) +2823 """ +2824 +2825 # Export with duckdb +2826 self.query(query=query_copy) +2827 +2828 # Export mode unknown +2829 else: +2830 log.error(f"Export mode '{export_mode}' unknown") +2831 raise ValueError(f"Export mode '{export_mode}' unknown") 2832 -2833 # create tmp header file -2834 query_output_header_tmp = os.path.join(tmp_dir, "header") -2835 tmp_files.append(query_output_header_tmp) -2836 self.get_header_file( -2837 header_file=query_output_header_tmp, remove_header_line=True -2838 ) -2839 -2840 # Add tmp header file for concat and compress -2841 input_files.append(query_output_header_tmp) -2842 -2843 # Add variants file -2844 input_files.append(query_output_database_tmp) -2845 -2846 # Output with concat and compress -2847 if not post_process_just_move and ( -2848 export_mode == "duckdb" -2849 or ( -2850 compressed -2851 and export_options.get("compression", None) == "bgzip" -2852 ) -2853 or sort -2854 or index -2855 ): -2856 -2857 # Compression type -2858 if not compressed: -2859 compression_type = "none" -2860 else: -2861 compression_type = export_options.get( -2862 "compression", "bgzip" -2863 ) -2864 -2865 # Concat and compress -2866 concat_and_compress_files( -2867 input_files=input_files, -2868 output_file=output_database, -2869 compression_type=compression_type, -2870 threads=threads, -2871 sort=sort, -2872 index=index, -2873 ) -2874 -2875 # Output already generated file (either compressed in gzip or not compressed, with 
included header if needed) -2876 else: -2877 -2878 # Move tmp file -2879 shutil.move(query_output_database_tmp, output_database) -2880 -2881 # Generate associated header file -2882 if output_header and export_header: -2883 -2884 # Log - Generate header -2885 log.debug("Generate header...") +2833 # Post process +2834 if post_process: +2835 +2836 # Log - number of records +2837 log.debug("Post processing...") +2838 +2839 # Input files +2840 input_files = [] +2841 +2842 # Export mode duckdb and include header +2843 if export_mode == "duckdb" and include_header: +2844 +2845 # create tmp header file +2846 query_output_header_tmp = os.path.join(tmp_dir, "header") +2847 tmp_files.append(query_output_header_tmp) +2848 self.get_header_file( +2849 header_file=query_output_header_tmp, remove_header_line=remove_header_line +2850 ) +2851 +2852 # Add tmp header file for concat and compress +2853 input_files.append(query_output_header_tmp) +2854 +2855 # Add variants file +2856 input_files.append(query_output_database_tmp) +2857 +2858 # Output with concat and compress +2859 if not post_process_just_move and ( +2860 export_mode == "duckdb" +2861 or ( +2862 compressed +2863 and export_options.get("compression", None) == "bgzip" +2864 ) +2865 or sort +2866 or index +2867 ): +2868 +2869 # Compression type +2870 if not compressed: +2871 compression_type = "none" +2872 else: +2873 compression_type = export_options.get( +2874 "compression", "bgzip" +2875 ) +2876 +2877 # Concat and compress +2878 concat_and_compress_files( +2879 input_files=input_files, +2880 output_file=output_database, +2881 compression_type=compression_type, +2882 threads=threads, +2883 sort=sort, +2884 index=index, +2885 ) 2886 -2887 # Create database -2888 database_for_header = Database(database=output_database) +2887 # Output already generated file (either compressed in gzip or not compressed, with included header if needed) +2888 else: 2889 -2890 # Remove header if exists -2891 remove_if_exists([output_header]) +2890 # Move tmp file +2891 shutil.move(query_output_database_tmp, output_database) 2892 -2893 # Find columns in database -2894 if schema_names: -2895 header_columns_from_database = schema_names -2896 else: -2897 header_columns_from_database = ( -2898 database_for_header.get_header_columns_from_database( -2899 database=output_database -2900 ) -2901 ) -2902 -2903 # Generate header file -2904 database_for_header.get_header_file( -2905 header_file=output_header, -2906 replace_header_line=header_columns_from_database, -2907 force=True, -2908 ) -2909 -2910 # Clean tmp files (deprecated) -2911 remove_if_exists(tmp_files) -2912 -2913 # Return if file exists -2914 return os.path.exists(output_database) and self.get_type(output_database) +2893 # Generate associated header file +2894 if output_header and export_header: +2895 +2896 # Log - Generate header +2897 log.debug("Generate header...") +2898 +2899 # Create database +2900 database_for_header = Database(database=output_database) +2901 +2902 # Remove header if exists +2903 remove_if_exists([output_header]) +2904 +2905 # Find columns in database +2906 if schema_names: +2907 header_columns_from_database = schema_names +2908 else: +2909 header_columns_from_database = ( +2910 database_for_header.get_header_columns_from_database( +2911 database=output_database +2912 ) +2913 ) +2914 +2915 # Generate header file +2916 database_for_header.get_header_file( +2917 header_file=output_header, +2918 replace_header_line=header_columns_from_database, +2919 force=True, +2920 ) +2921 +2922 # Clean tmp files 
(deprecated) +2923 remove_if_exists(tmp_files) +2924 +2925 # Return if file exists +2926 return os.path.exists(output_database) and self.get_type(output_database) @@ -4980,1109 +4992,1121 @@

    1810 "bed", 1811 "json", 1812 ]: -1813 sql_from = self.get_sql_from( -1814 database=database, header_file=header_file -1815 ) -1816 sql_query = f"SELECT * FROM {sql_from} LIMIT 0" -1817 return list(self.conn.query(sql_query).columns) -1818 except ValueError: -1819 return [] -1820 -1821 return [] -1822 -1823 def get_table_columns_from_format(self, database: str = None) -> list: -1824 """ -1825 The function `get_table_columns_from_format` returns a list of table columns based on the -1826 specified database format. +1813 # Query +1814 sql_from = self.get_sql_from( +1815 database=database, header_file=header_file +1816 ) +1817 sql_query = f"SELECT * FROM {sql_from} LIMIT 0" +1818 +1819 # Get columns +1820 result_description = self.conn.execute(sql_query).description +1821 +1822 # Extract columns' names +1823 columns = [desc[0] for desc in result_description] +1824 +1825 # Return columns as list +1826 return columns 1827 -1828 :param database: The `database` parameter is a string that represents the name of the database. -1829 It is an optional parameter, which means it has a default value of `None`. If no value is -1830 provided for the `database` parameter, the `get_database()` method is called to retrieve the -1831 current database name -1832 :type database: str -1833 :return: a list of table columns. -1834 """ -1835 -1836 table_columns = None +1828 except ValueError: +1829 return [] +1830 +1831 return [] +1832 +1833 def get_table_columns_from_format(self, database: str = None) -> list: +1834 """ +1835 The function `get_table_columns_from_format` returns a list of table columns based on the +1836 specified database format. 1837 -1838 if not database: -1839 database = self.get_database() -1840 -1841 database_format = self.get_format(database) -1842 -1843 needed_columns = DATABASE_TYPE_NEEDED_COLUMNS.get(database_format, None) -1844 if needed_columns: -1845 table_columns = list(needed_columns.keys()) -1846 else: -1847 table_columns = [] -1848 -1849 return table_columns +1838 :param database: The `database` parameter is a string that represents the name of the database. +1839 It is an optional parameter, which means it has a default value of `None`. If no value is +1840 provided for the `database` parameter, the `get_database()` method is called to retrieve the +1841 current database name +1842 :type database: str +1843 :return: a list of table columns. +1844 """ +1845 +1846 table_columns = None +1847 +1848 if not database: +1849 database = self.get_database() 1850 -1851 def get_table_columns_from_file( -1852 self, -1853 database: str = None, -1854 header_file: str = None, -1855 header_file_find: bool = True, -1856 ) -> list: -1857 """ -1858 The function `get_table_columns_from_file` retrieves the column names from a database or header -1859 file. +1851 database_format = self.get_format(database) +1852 +1853 needed_columns = DATABASE_TYPE_NEEDED_COLUMNS.get(database_format, None) +1854 if needed_columns: +1855 table_columns = list(needed_columns.keys()) +1856 else: +1857 table_columns = [] +1858 +1859 return table_columns 1860 -1861 :param database: The `database` parameter is a string that represents the name or path of the -1862 database file. If this parameter is not provided, the `get_database()` method is called to -1863 retrieve the database name or path -1864 :type database: str -1865 :param header_file: The `header_file` parameter is a string that represents the file path or -1866 name of the header file. 
This file contains the header information for a table, which typically -1867 includes the names of the columns in the table -1868 :type header_file: str -1869 :param header_file_find: Allow header file find if not provided -1870 :type header_file_find: bool -1871 :return: a list of table columns. -1872 """ -1873 -1874 table_columns = None -1875 -1876 if not database: -1877 database = self.get_database() -1878 -1879 if not header_file and header_file_find: -1880 header_file = self.get_header_file() -1881 -1882 if not header_file and header_file_find: -1883 header_file = self.find_header_file(database) -1884 -1885 database_format = self.get_format(database=database) -1886 delimiter = SEP_TYPE.get(database_format, "\t") -1887 -1888 # Try from database file -1889 try: -1890 table_header = self.read_header_file(database) -1891 except ValueError: -1892 table_header = None -1893 -1894 if table_header: -1895 try: -1896 table_columns = ( -1897 table_header[self.get_header_length(header_file=table_header)] -1898 .strip() -1899 .split(delimiter) -1900 ) -1901 except IndexError: -1902 table_columns = None -1903 else: -1904 table_columns = None -1905 -1906 if not table_columns: -1907 # Try from header file -1908 try: -1909 table_header = self.read_header_file(header_file) -1910 except ValueError: -1911 table_header = None -1912 -1913 if table_header: -1914 try: -1915 table_columns = ( -1916 table_header[self.get_header_length(header_file=table_header)] -1917 .strip() -1918 .split(delimiter) -1919 ) -1920 except IndexError: -1921 table_columns = None -1922 else: -1923 table_columns = None -1924 -1925 return table_columns -1926 -1927 def get_annotations(self, database: str = None) -> object: -1928 """ -1929 This function returns the annotations of a database or the default database if none is -1930 specified. -1931 -1932 :param database: The parameter `database` is a string that represents the name of the database -1933 to retrieve annotations from. If no database name is provided, the method will use the default -1934 database name obtained from the `get_database()` method -1935 :type database: str -1936 :return: The function `get_annotations` returns the `infos` attribute of the header of a -1937 database. If the `database` parameter is not provided, it gets the current database using the -1938 `get_database` method. If there is no header, it returns `None`. -1939 """ -1940 -1941 if not database: -1942 database = self.get_database() -1943 -1944 if self.get_header(database=database): -1945 return self.get_header(database=database).infos -1946 else: -1947 return None -1948 -1949 def get_extra_columns( -1950 self, database: str = None, database_type: str = None, sql_query: str = None -1951 ) -> list: -1952 """ -1953 This Python function returns a list of extra columns in a database table that are not needed -1954 based on the database type and existing columns. -1955 -1956 :param database: A string representing the name of the database to retrieve columns from. If -1957 None is provided, the default database will be used -1958 :type database: str -1959 :param database_type: The `database_type` parameter in the `get_extra_columns` function -1960 represents the type of the database for which you want to retrieve the list of extra columns. 
It -1961 is used to determine which columns are needed based on the database type and the existing -1962 columns in the specified database table -1963 :type database_type: str -1964 :param sql_query: The `sql_query` parameter in the `get_extra_columns` function is used to pass -1965 an SQL query that can be used to retrieve specific columns from the database. This query can be -1966 customized to filter columns based on certain conditions or criteria before analyzing them to -1967 determine the extra columns that are not needed -1968 :type sql_query: str -1969 :return: A list of extra columns in a database table that are not needed based on the database -1970 type and existing columns. -1971 """ -1972 -1973 if not database: -1974 database = self.get_database() -1975 -1976 if not database: -1977 return [] -1978 -1979 existing_columns = self.get_columns( -1980 database=database, -1981 table=self.get_database_table(database), -1982 sql_query=sql_query, -1983 ) -1984 if not database_type: -1985 database_type = self.get_type(database=database, sql_query=sql_query) -1986 needed_columns = self.get_needed_columns( -1987 database_columns=existing_columns, database_type=database_type -1988 ) -1989 -1990 extra_columns = existing_columns.copy() -1991 -1992 for needed_col in needed_columns: -1993 if needed_columns.get(needed_col) in extra_columns: -1994 extra_columns.remove(needed_columns.get(needed_col)) -1995 -1996 return extra_columns -1997 -1998 def is_vcf(self, database: str = None, sql_query: str = None) -> bool: -1999 """ -2000 The `is_vcf` function checks if a given database is of type "vcf" by examining its columns and -2001 their types. -2002 -2003 :param database: The `database` parameter in the `is_vcf` function is a string that represents -2004 the name of the database that the function will use to check if the file is a VCF (Variant Call -2005 Format) file. If the `database` parameter is not provided when calling the function, it will -2006 :type database: str -2007 :param sql_query: The `sql_query` parameter in the `is_vcf` function is used to pass an SQL -2008 query string that can be used to filter the columns retrieved from the database. This query can -2009 be used to narrow down the columns that are considered when checking if the database is of type -2010 "vcf" -2011 :type sql_query: str -2012 :return: The function `is_vcf` returns a boolean value indicating whether the database type is -2013 "vcf" or not. -2014 """ -2015 -2016 if not database: -2017 database = self.get_database() -2018 -2019 if not database: -2020 return False -2021 -2022 database_columns = self.get_columns( -2023 database=database, -2024 table=self.get_database_table(database), -2025 sql_query=sql_query, -2026 ) -2027 -2028 # Assume VCF is 8 needed columns, either only or with extra FORMAT column (assume other are samples) -2029 return self.get_type_from_columns( -2030 database_columns=database_columns, check_database_type="vcf" -2031 ) == "vcf" and ("FORMAT" in database_columns or len(database_columns) == 8) -2032 -2033 def get_conn(self): -2034 """ -2035 The function returns the connection object. -2036 :return: The method is returning the value of the instance variable `self.conn`. 
-2037 """ -2038 -2039 return self.conn -2040 -2041 def is_genotype_column( -2042 self, -2043 column: str, -2044 database: str = None, -2045 downsampling: int = 1000, -2046 check_format: bool = True, -2047 ) -> bool: -2048 """ -2049 The `is_genotype_column` function in Python checks if a specified column in a database contains -2050 genotype data based on a regular expression pattern. -2051 -2052 :param column: The `column` parameter is a string that represents the name of a column in a -2053 database table. It is used to specify the column for which you want to check if it contains -2054 genotype information based on a regular expression pattern -2055 :type column: str -2056 :param database: The `database` parameter in the `is_genotype_column` method is used to specify -2057 the name of the database from which the data will be queried. If a database is provided, the -2058 method will query the specified database to check if the given column contains genotype -2059 information. If no database is provided, -2060 :type database: str -2061 :param downsampling: The `downsampling` parameter in the `is_genotype_column` method is an -2062 integer value that determines the number of rows to be sampled from the database table when -2063 checking for genotype information in the specified column. This parameter is used to limit the -2064 number of rows to be processed in order to improve performance, defaults to 1000 -2065 :type downsampling: int (optional) -2066 :param check_format: The `check_format` parameter in the `is_genotype_column` method is a -2067 boolean flag that determines whether the function should check the format of the data before -2068 proceeding with the genotype column analysis. If `check_format` is set to `True`, the function -2069 will verify if the specified column exists in, defaults to True -2070 :type check_format: bool (optional) -2071 :return: The `is_genotype_column` method returns a boolean value. If the specified column in a -2072 database table contains genotype information, it returns `True`; otherwise, it returns `False`. 
-2073 """ -2074 -2075 # Table variants -2076 table_variants_from = self.get_sql_database_link(database=database) -2077 -2078 # Check if format column is present -2079 if check_format: -2080 query = f""" -2081 SELECT * FROM {table_variants_from} -2082 LIMIT 0 -2083 """ -2084 df = self.query(query=query) -2085 if "FORMAT" not in df.columns or column not in df.columns: -2086 return False -2087 query_format = f""" -2088 AND (len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{column}" AS VARCHAR), ':')) OR regexp_matches(CAST("{column}" AS VARCHAR), '^[.]([/|][.])*$')) -2089 """ -2090 else: -2091 query_format = "" -2092 -2093 # Query number of samples -2094 query_downsampling = f""" -2095 SELECT "{column}", FORMAT -2096 FROM {table_variants_from} -2097 LIMIT {downsampling} -2098 """ -2099 df_downsampling = self.query(query=query_downsampling) -2100 -2101 # Query to check genotype -2102 query_genotype = f""" -2103 SELECT * -2104 FROM df_downsampling -2105 WHERE ( -2106 regexp_matches(CAST("{column}" AS VARCHAR), '^[0-9.]([/|][0-9.])*') -2107 {query_format} -2108 ) -2109 """ -2110 df_genotype = self.query(query=query_genotype) -2111 -2112 # return -2113 return len(df_genotype) == len(df_downsampling) -2114 -2115 def export( -2116 self, -2117 output_database: str, -2118 output_header: str = None, -2119 header_in_output: bool = True, -2120 database: str = None, -2121 table: str = "variants", -2122 parquet_partitions: list = None, -2123 threads: int = 1, -2124 sort: bool = False, -2125 index: bool = False, -2126 existing_columns_header: list = [], -2127 order_by: str = "", -2128 query: str = None, -2129 compression_type: str = None, -2130 chunk_size: int = 1000000, -2131 export_mode: str = "pyarrow", -2132 compresslevel: int = 6, -2133 export_header: bool = True, -2134 sample_list: list = None, -2135 ) -> bool: -2136 """ -2137 The `export` function exports data from a database to a specified output format, compresses it -2138 if necessary, and returns a boolean value indicating whether the export was successful or not. -2139 -2140 :param output_database: The `output_database` parameter is a string that represents the path and -2141 filename of the output file to be exported. It specifies where the exported data will be saved -2142 :type output_database: str -2143 :param output_header: The `output_header` parameter is an optional string that represents the -2144 header of the output file. If provided, it specifies the header that will be included in the -2145 output file. If not provided, the header will be automatically detected based on the output file -2146 format -2147 :type output_header: str -2148 :param header_in_output: The `header_in_output` parameter is a boolean value that determines -2149 whether the header should be included in the output file. If set to `True`, the header will be -2150 included in the output file. If set to `False`, the header will not be included in the output -2151 file. By default,, defaults to True -2152 :type header_in_output: bool (optional) -2153 :param database: The `database` parameter is the name of the database from which you want to -2154 export data. If this parameter is not provided, the function will use the `get_database()` -2155 method to retrieve the current database -2156 :type database: str -2157 :param table: The `table` parameter specifies the name of the table in the database from which -2158 the data will be exported. 
By default, if not specified, it is set to "variants", defaults to -2159 variants -2160 :type table: str (optional) -2161 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the -2162 partition columns for the Parquet output format. Each element in the list represents a partition -2163 column. The partitions are used to organize the data in the Parquet file based on the values of -2164 the specified columns -2165 :type parquet_partitions: list -2166 :param threads: The `threads` parameter in the `export` function is an optional integer that -2167 specifies the number of threads to use for exporting the data. It determines the level of -2168 parallelism during the export process. By default, it is set to 1, defaults to 1 -2169 :type threads: int (optional) -2170 :param sort: The `sort` parameter in the `export` function is a boolean value that specifies -2171 whether the output file should be sorted based on the genomic coordinates of the variants. If -2172 `sort` is set to `True`, the output file will be sorted. If `sort` is set to `False`,, defaults -2173 to False -2174 :type sort: bool (optional) -2175 :param index: The `index` parameter is a boolean value that specifies whether to index the -2176 output file. If `index` is set to `True`, the output file will be indexed. If `index` is set to -2177 `False` or not provided, the output file will not be indexed. By default,, defaults to False -2178 :type index: bool (optional) -2179 :param existing_columns_header: The `existing_columns_header` parameter is a list that -2180 represents the existing columns in the header of the output file. It is used to determine the -2181 columns that should be included in the output file. If this parameter is not provided, the -2182 function will automatically detect the header columns based on the output file format -2183 :type existing_columns_header: list -2184 :param order_by: The `order_by` parameter in the `export` function is a string that specifies -2185 the columns by which the output file should be ordered. You can specify multiple columns -2186 separated by commas. Each column can be followed by the keyword "ASC" (ascending) or "DESC" -2187 (descending) to specify -2188 :type order_by: str -2189 :param query: The `query` parameter in the `export` function represents a SQL query that -2190 specifies the data to be exported from the database. If provided, the function will export the -2191 result of this query. If the `query` parameter is not provided, the function will generate a -2192 query to export the data from -2193 :type query: str -2194 :param compression_type: The `compression_type` parameter in the `export` function specifies the -2195 type of compression to be applied to the output file. By default, the compression type is set to -2196 "bgzip". This parameter allows you to choose the compression algorithm for the output file, such -2197 as "gzip", "bgzip -2198 :type compression_type: str -2199 :param chunk_size: The `chunk_size` parameter in the `export` function specifies the size of -2200 each chunk or batch of data that will be processed during the export operation. 
It determines -2201 how many records or lines of data will be included in each chunk that is processed at a time, -2202 defaults to 1000000 -2203 :type chunk_size: int (optional) -2204 :param export_mode: The `export_mode` parameter in the `export` function specifies the mode of -2205 export, which can be either "pyarrow" or "duckdb", defaults to pyarrow -2206 :type export_mode: str (optional) -2207 :param compresslevel: The `compresslevel` parameter in the `export` function represents the -2208 level of compression for gzip. By default, it is set to 6. This parameter allows you to specify -2209 the compression level when using gzip compression for the output file. The compression level can -2210 range from 0 (no compression), defaults to 6 -2211 :type compresslevel: int (optional) -2212 :param export_header: The `export_header` parameter is a boolean flag that determines whether -2213 the header of a VCF file should be exported to a separate file or not. If `export_header` is -2214 True, the header will be exported to a file. If `export_header` is False, the header will not -2215 be, defaults to True -2216 :type export_header: bool (optional) -2217 :param sample_list: The `sample_list` parameter in the `export` function is a list that -2218 specifies the samples to be included in the exported data. If provided, the samples listed in -2219 this parameter will be included in the output file. If not provided, the function will determine -2220 the samples to include based on the data -2221 :type sample_list: list -2222 :return: The `export` function returns a boolean value indicating whether the export was -2223 successful or not. -2224 """ -2225 -2226 # Full path -2227 output_database = full_path(output_database) -2228 output_header = full_path(output_header) -2229 database = full_path(database) -2230 -2231 # Database -2232 if not database: -2233 database = self.get_database() -2234 if not database: -2235 return False -2236 -2237 # Chunk size -2238 if not chunk_size: -2239 chunk_size = 1000000 -2240 else: -2241 chunk_size = int(chunk_size) -2242 -2243 # Export mode -2244 # Either "pyarrow" (default) or "duckdb" -2245 if not export_mode: -2246 export_mode = "pyarrow" -2247 -2248 # Compression level -2249 if not compresslevel: -2250 compresslevel = 6 -2251 -2252 # Remove output if exists -2253 remove_if_exists(output_database) -2254 -2255 # Tmp -2256 tmp_folder = os.path.dirname(output_database) -2257 if not tmp_folder: -2258 tmp_folder = "." -2259 -2260 with TemporaryDirectory( -2261 dir=tmp_folder, prefix="howard_database_export_" -2262 ) as tmp_dir: -2263 -2264 # tmp files -2265 tmp_files = [] -2266 -2267 # query_set -2268 query_set = "" +1861 def get_table_columns_from_file( +1862 self, +1863 database: str = None, +1864 header_file: str = None, +1865 header_file_find: bool = True, +1866 ) -> list: +1867 """ +1868 The function `get_table_columns_from_file` retrieves the column names from a database or header +1869 file. +1870 +1871 :param database: The `database` parameter is a string that represents the name or path of the +1872 database file. If this parameter is not provided, the `get_database()` method is called to +1873 retrieve the database name or path +1874 :type database: str +1875 :param header_file: The `header_file` parameter is a string that represents the file path or +1876 name of the header file. 
This file contains the header information for a table, which typically +1877 includes the names of the columns in the table +1878 :type header_file: str +1879 :param header_file_find: Allow header file find if not provided +1880 :type header_file_find: bool +1881 :return: a list of table columns. +1882 """ +1883 +1884 table_columns = None +1885 +1886 if not database: +1887 database = self.get_database() +1888 +1889 if not header_file and header_file_find: +1890 header_file = self.get_header_file() +1891 +1892 if not header_file and header_file_find: +1893 header_file = self.find_header_file(database) +1894 +1895 database_format = self.get_format(database=database) +1896 delimiter = SEP_TYPE.get(database_format, "\t") +1897 +1898 # Try from database file +1899 try: +1900 table_header = self.read_header_file(database) +1901 except ValueError: +1902 table_header = None +1903 +1904 if table_header: +1905 try: +1906 table_columns = ( +1907 table_header[self.get_header_length(header_file=table_header)] +1908 .strip() +1909 .split(delimiter) +1910 ) +1911 except IndexError: +1912 table_columns = None +1913 else: +1914 table_columns = None +1915 +1916 if not table_columns: +1917 # Try from header file +1918 try: +1919 table_header = self.read_header_file(header_file) +1920 except ValueError: +1921 table_header = None +1922 +1923 if table_header: +1924 try: +1925 table_columns = ( +1926 table_header[self.get_header_length(header_file=table_header)] +1927 .strip() +1928 .split(delimiter) +1929 ) +1930 except IndexError: +1931 table_columns = None +1932 else: +1933 table_columns = None +1934 +1935 return table_columns +1936 +1937 def get_annotations(self, database: str = None) -> object: +1938 """ +1939 This function returns the annotations of a database or the default database if none is +1940 specified. +1941 +1942 :param database: The parameter `database` is a string that represents the name of the database +1943 to retrieve annotations from. If no database name is provided, the method will use the default +1944 database name obtained from the `get_database()` method +1945 :type database: str +1946 :return: The function `get_annotations` returns the `infos` attribute of the header of a +1947 database. If the `database` parameter is not provided, it gets the current database using the +1948 `get_database` method. If there is no header, it returns `None`. +1949 """ +1950 +1951 if not database: +1952 database = self.get_database() +1953 +1954 if self.get_header(database=database): +1955 return self.get_header(database=database).infos +1956 else: +1957 return None +1958 +1959 def get_extra_columns( +1960 self, database: str = None, database_type: str = None, sql_query: str = None +1961 ) -> list: +1962 """ +1963 This Python function returns a list of extra columns in a database table that are not needed +1964 based on the database type and existing columns. +1965 +1966 :param database: A string representing the name of the database to retrieve columns from. If +1967 None is provided, the default database will be used +1968 :type database: str +1969 :param database_type: The `database_type` parameter in the `get_extra_columns` function +1970 represents the type of the database for which you want to retrieve the list of extra columns. 
It +1971 is used to determine which columns are needed based on the database type and the existing +1972 columns in the specified database table +1973 :type database_type: str +1974 :param sql_query: The `sql_query` parameter in the `get_extra_columns` function is used to pass +1975 an SQL query that can be used to retrieve specific columns from the database. This query can be +1976 customized to filter columns based on certain conditions or criteria before analyzing them to +1977 determine the extra columns that are not needed +1978 :type sql_query: str +1979 :return: A list of extra columns in a database table that are not needed based on the database +1980 type and existing columns. +1981 """ +1982 +1983 if not database: +1984 database = self.get_database() +1985 +1986 if not database: +1987 return [] +1988 +1989 existing_columns = self.get_columns( +1990 database=database, +1991 table=self.get_database_table(database), +1992 sql_query=sql_query, +1993 ) +1994 if not database_type: +1995 database_type = self.get_type(database=database, sql_query=sql_query) +1996 needed_columns = self.get_needed_columns( +1997 database_columns=existing_columns, database_type=database_type +1998 ) +1999 +2000 extra_columns = existing_columns.copy() +2001 +2002 for needed_col in needed_columns: +2003 if needed_columns.get(needed_col) in extra_columns: +2004 extra_columns.remove(needed_columns.get(needed_col)) +2005 +2006 return extra_columns +2007 +2008 def is_vcf(self, database: str = None, sql_query: str = None) -> bool: +2009 """ +2010 The `is_vcf` function checks if a given database is of type "vcf" by examining its columns and +2011 their types. +2012 +2013 :param database: The `database` parameter in the `is_vcf` function is a string that represents +2014 the name of the database that the function will use to check if the file is a VCF (Variant Call +2015 Format) file. If the `database` parameter is not provided when calling the function, it will +2016 :type database: str +2017 :param sql_query: The `sql_query` parameter in the `is_vcf` function is used to pass an SQL +2018 query string that can be used to filter the columns retrieved from the database. This query can +2019 be used to narrow down the columns that are considered when checking if the database is of type +2020 "vcf" +2021 :type sql_query: str +2022 :return: The function `is_vcf` returns a boolean value indicating whether the database type is +2023 "vcf" or not. +2024 """ +2025 +2026 if not database: +2027 database = self.get_database() +2028 +2029 if not database: +2030 return False +2031 +2032 database_columns = self.get_columns( +2033 database=database, +2034 table=self.get_database_table(database), +2035 sql_query=sql_query, +2036 ) +2037 +2038 # Assume VCF is 8 needed columns, either only or with extra FORMAT column (assume other are samples) +2039 return self.get_type_from_columns( +2040 database_columns=database_columns, check_database_type="vcf" +2041 ) == "vcf" and ("FORMAT" in database_columns or len(database_columns) == 8) +2042 +2043 def get_conn(self): +2044 """ +2045 The function returns the connection object. +2046 :return: The method is returning the value of the instance variable `self.conn`. 
+2047 """ +2048 +2049 return self.conn +2050 +2051 def is_genotype_column( +2052 self, +2053 column: str, +2054 database: str = None, +2055 downsampling: int = 1000, +2056 check_format: bool = True, +2057 ) -> bool: +2058 """ +2059 The `is_genotype_column` function in Python checks if a specified column in a database contains +2060 genotype data based on a regular expression pattern. +2061 +2062 :param column: The `column` parameter is a string that represents the name of a column in a +2063 database table. It is used to specify the column for which you want to check if it contains +2064 genotype information based on a regular expression pattern +2065 :type column: str +2066 :param database: The `database` parameter in the `is_genotype_column` method is used to specify +2067 the name of the database from which the data will be queried. If a database is provided, the +2068 method will query the specified database to check if the given column contains genotype +2069 information. If no database is provided, +2070 :type database: str +2071 :param downsampling: The `downsampling` parameter in the `is_genotype_column` method is an +2072 integer value that determines the number of rows to be sampled from the database table when +2073 checking for genotype information in the specified column. This parameter is used to limit the +2074 number of rows to be processed in order to improve performance, defaults to 1000 +2075 :type downsampling: int (optional) +2076 :param check_format: The `check_format` parameter in the `is_genotype_column` method is a +2077 boolean flag that determines whether the function should check the format of the data before +2078 proceeding with the genotype column analysis. If `check_format` is set to `True`, the function +2079 will verify if the specified column exists in, defaults to True +2080 :type check_format: bool (optional) +2081 :return: The `is_genotype_column` method returns a boolean value. If the specified column in a +2082 database table contains genotype information, it returns `True`; otherwise, it returns `False`. 
+2083 """ +2084 +2085 # Table variants +2086 table_variants_from = self.get_sql_database_link(database=database) +2087 +2088 # Check if format column is present +2089 if check_format: +2090 query = f""" +2091 SELECT * FROM {table_variants_from} +2092 LIMIT 0 +2093 """ +2094 df = self.query(query=query) +2095 if "FORMAT" not in df.columns or column not in df.columns: +2096 return False +2097 query_format = f""" +2098 AND (len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{column}" AS VARCHAR), ':')) OR regexp_matches(CAST("{column}" AS VARCHAR), '^[.]([/|][.])*$')) +2099 """ +2100 else: +2101 query_format = "" +2102 +2103 # Query number of samples +2104 query_downsampling = f""" +2105 SELECT "{column}", FORMAT +2106 FROM {table_variants_from} +2107 LIMIT {downsampling} +2108 """ +2109 df_downsampling = self.query(query=query_downsampling) +2110 +2111 # Query to check genotype +2112 query_genotype = f""" +2113 SELECT * +2114 FROM df_downsampling +2115 WHERE ( +2116 regexp_matches(CAST("{column}" AS VARCHAR), '^[0-9.]([/|][0-9.])*') +2117 {query_format} +2118 ) +2119 """ +2120 df_genotype = self.query(query=query_genotype) +2121 +2122 # return +2123 return len(df_genotype) == len(df_downsampling) +2124 +2125 def export( +2126 self, +2127 output_database: str, +2128 output_header: str = None, +2129 header_in_output: bool = True, +2130 database: str = None, +2131 table: str = "variants", +2132 parquet_partitions: list = None, +2133 threads: int = 1, +2134 sort: bool = False, +2135 index: bool = False, +2136 existing_columns_header: list = [], +2137 order_by: str = "", +2138 query: str = None, +2139 compression_type: str = None, +2140 chunk_size: int = 1000000, +2141 export_mode: str = "pyarrow", +2142 compresslevel: int = 6, +2143 export_header: bool = True, +2144 sample_list: list = None, +2145 ) -> bool: +2146 """ +2147 The `export` function exports data from a database to a specified output format, compresses it +2148 if necessary, and returns a boolean value indicating whether the export was successful or not. +2149 +2150 :param output_database: The `output_database` parameter is a string that represents the path and +2151 filename of the output file to be exported. It specifies where the exported data will be saved +2152 :type output_database: str +2153 :param output_header: The `output_header` parameter is an optional string that represents the +2154 header of the output file. If provided, it specifies the header that will be included in the +2155 output file. If not provided, the header will be automatically detected based on the output file +2156 format +2157 :type output_header: str +2158 :param header_in_output: The `header_in_output` parameter is a boolean value that determines +2159 whether the header should be included in the output file. If set to `True`, the header will be +2160 included in the output file. If set to `False`, the header will not be included in the output +2161 file. By default,, defaults to True +2162 :type header_in_output: bool (optional) +2163 :param database: The `database` parameter is the name of the database from which you want to +2164 export data. If this parameter is not provided, the function will use the `get_database()` +2165 method to retrieve the current database +2166 :type database: str +2167 :param table: The `table` parameter specifies the name of the table in the database from which +2168 the data will be exported. 
By default, if not specified, it is set to "variants", defaults to +2169 variants +2170 :type table: str (optional) +2171 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the +2172 partition columns for the Parquet output format. Each element in the list represents a partition +2173 column. The partitions are used to organize the data in the Parquet file based on the values of +2174 the specified columns +2175 :type parquet_partitions: list +2176 :param threads: The `threads` parameter in the `export` function is an optional integer that +2177 specifies the number of threads to use for exporting the data. It determines the level of +2178 parallelism during the export process. By default, it is set to 1, defaults to 1 +2179 :type threads: int (optional) +2180 :param sort: The `sort` parameter in the `export` function is a boolean value that specifies +2181 whether the output file should be sorted based on the genomic coordinates of the variants. If +2182 `sort` is set to `True`, the output file will be sorted. If `sort` is set to `False`,, defaults +2183 to False +2184 :type sort: bool (optional) +2185 :param index: The `index` parameter is a boolean value that specifies whether to index the +2186 output file. If `index` is set to `True`, the output file will be indexed. If `index` is set to +2187 `False` or not provided, the output file will not be indexed. By default,, defaults to False +2188 :type index: bool (optional) +2189 :param existing_columns_header: The `existing_columns_header` parameter is a list that +2190 represents the existing columns in the header of the output file. It is used to determine the +2191 columns that should be included in the output file. If this parameter is not provided, the +2192 function will automatically detect the header columns based on the output file format +2193 :type existing_columns_header: list +2194 :param order_by: The `order_by` parameter in the `export` function is a string that specifies +2195 the columns by which the output file should be ordered. You can specify multiple columns +2196 separated by commas. Each column can be followed by the keyword "ASC" (ascending) or "DESC" +2197 (descending) to specify +2198 :type order_by: str +2199 :param query: The `query` parameter in the `export` function represents a SQL query that +2200 specifies the data to be exported from the database. If provided, the function will export the +2201 result of this query. If the `query` parameter is not provided, the function will generate a +2202 query to export the data from +2203 :type query: str +2204 :param compression_type: The `compression_type` parameter in the `export` function specifies the +2205 type of compression to be applied to the output file. By default, the compression type is set to +2206 "bgzip". This parameter allows you to choose the compression algorithm for the output file, such +2207 as "gzip", "bgzip +2208 :type compression_type: str +2209 :param chunk_size: The `chunk_size` parameter in the `export` function specifies the size of +2210 each chunk or batch of data that will be processed during the export operation. 
It determines +2211 how many records or lines of data will be included in each chunk that is processed at a time, +2212 defaults to 1000000 +2213 :type chunk_size: int (optional) +2214 :param export_mode: The `export_mode` parameter in the `export` function specifies the mode of +2215 export, which can be either "pyarrow" or "duckdb", defaults to pyarrow +2216 :type export_mode: str (optional) +2217 :param compresslevel: The `compresslevel` parameter in the `export` function represents the +2218 level of compression for gzip. By default, it is set to 6. This parameter allows you to specify +2219 the compression level when using gzip compression for the output file. The compression level can +2220 range from 0 (no compression), defaults to 6 +2221 :type compresslevel: int (optional) +2222 :param export_header: The `export_header` parameter is a boolean flag that determines whether +2223 the header of a VCF file should be exported to a separate file or not. If `export_header` is +2224 True, the header will be exported to a file. If `export_header` is False, the header will not +2225 be, defaults to True +2226 :type export_header: bool (optional) +2227 :param sample_list: The `sample_list` parameter in the `export` function is a list that +2228 specifies the samples to be included in the exported data. If provided, the samples listed in +2229 this parameter will be included in the output file. If not provided, the function will determine +2230 the samples to include based on the data +2231 :type sample_list: list +2232 :return: The `export` function returns a boolean value indicating whether the export was +2233 successful or not. +2234 """ +2235 +2236 # Full path +2237 output_database = full_path(output_database) +2238 output_header = full_path(output_header) +2239 database = full_path(database) +2240 +2241 # Database +2242 if not database: +2243 database = self.get_database() +2244 if not database: +2245 return False +2246 +2247 # Chunk size +2248 if not chunk_size: +2249 chunk_size = 1000000 +2250 else: +2251 chunk_size = int(chunk_size) +2252 +2253 # Export mode +2254 # Either "pyarrow" (default) or "duckdb" +2255 if not export_mode: +2256 export_mode = "pyarrow" +2257 +2258 # Compression level +2259 if not compresslevel: +2260 compresslevel = 6 +2261 +2262 # Remove output if exists +2263 remove_if_exists(output_database) +2264 +2265 # Tmp +2266 tmp_folder = os.path.dirname(output_database) +2267 if not tmp_folder: +2268 tmp_folder = "." 
2269 -2270 # Header columns -2271 if not existing_columns_header and output_header: -2272 existing_columns_header = self.get_header_file_columns(output_header) +2270 with TemporaryDirectory( +2271 dir=tmp_folder, prefix="howard_database_export_" +2272 ) as tmp_dir: 2273 -2274 # Auto-detect output type and compression and delimiter -2275 output_type = get_file_format(output_database) -2276 compressed = self.is_compressed(database=output_database) -2277 delimiter = FILE_FORMAT_DELIMITERS.get(output_type, "\t") -2278 -2279 # database type -2280 if output_type in ["vcf"]: -2281 database_type = "vcf" -2282 elif output_type in ["bed"]: -2283 database_type = "regions" -2284 else: -2285 database_type = self.get_type(database=database, sql_query=query) -2286 -2287 # database object -2288 # If database is string, then create database conn -2289 if isinstance(database, str): -2290 database_conn = Database(database=database).get_conn() -2291 else: -2292 database_conn = database -2293 -2294 # Existing columns -2295 existing_columns = self.get_columns( -2296 database=database_conn, -2297 table=self.get_database_table(database=database), -2298 sql_query=query, -2299 ) -2300 -2301 # Extra columns -2302 extra_columns = self.get_extra_columns( -2303 database=database_conn, database_type=output_type, sql_query=query -2304 ) -2305 -2306 # Needed columns -2307 needed_columns = self.get_needed_columns( -2308 database_columns=existing_columns, database_type=database_type +2274 # tmp files +2275 tmp_files = [] +2276 +2277 # query_set +2278 query_set = "" +2279 +2280 # Header columns +2281 if not existing_columns_header and output_header: +2282 existing_columns_header = self.get_header_file_columns(output_header) +2283 +2284 # Auto-detect output type and compression and delimiter +2285 output_type = get_file_format(output_database) +2286 compressed = self.is_compressed(database=output_database) +2287 delimiter = FILE_FORMAT_DELIMITERS.get(output_type, "\t") +2288 +2289 # database type +2290 if output_type in ["vcf"]: +2291 database_type = "vcf" +2292 elif output_type in ["bed"]: +2293 database_type = "regions" +2294 else: +2295 database_type = self.get_type(database=database, sql_query=query) +2296 +2297 # database object +2298 # If database is string, then create database conn +2299 if isinstance(database, str): +2300 database_conn = Database(database=database).get_conn() +2301 else: +2302 database_conn = database +2303 +2304 # Existing columns +2305 existing_columns = self.get_columns( +2306 database=database_conn, +2307 table=self.get_database_table(database=database), +2308 sql_query=query, 2309 ) 2310 -2311 # Order by -2312 order_by_list = [] -2313 if order_by: -2314 # Split order by options -2315 order_by_split = order_by.split(",") -2316 for order_by_option in order_by_split: -2317 # Split order by option -2318 order_by_option_split = order_by_option.strip().split(" ") -2319 order_by_option_split_column = order_by_option_split[0] -2320 if len(order_by_option_split) > 1: -2321 order_by_option_split_order = order_by_option_split[1] -2322 else: -2323 order_by_option_split_order = "ASC" -2324 # Chek if column exists -2325 if ( -2326 order_by_option_split_column.replace('"', "").strip() -2327 in existing_columns -2328 ): -2329 order_by_list.append( -2330 f"{order_by_option_split_column} {order_by_option_split_order}" -2331 ) -2332 -2333 # Clean order by -2334 order_by_clean = ", ".join(order_by_list) -2335 -2336 # Query values -2337 default_empty_value = "" -2338 query_export_format = None -2339 include_header = 
False -2340 post_process = False -2341 order_by_sql = "" -2342 post_process_just_move = False -2343 -2344 # export options -2345 export_options = {} -2346 -2347 # VCF -2348 if output_type in ["vcf"]: -2349 if not self.is_vcf(database=database_conn, sql_query=query): -2350 extra_columns = [] -2351 else: -2352 extra_columns = existing_columns_header +2311 # Extra columns +2312 extra_columns = self.get_extra_columns( +2313 database=database_conn, database_type=output_type, sql_query=query +2314 ) +2315 +2316 # Needed columns +2317 needed_columns = self.get_needed_columns( +2318 database_columns=existing_columns, database_type=database_type +2319 ) +2320 +2321 # Order by +2322 order_by_list = [] +2323 if order_by: +2324 # Split order by options +2325 order_by_split = order_by.split(",") +2326 for order_by_option in order_by_split: +2327 # Split order by option +2328 order_by_option_split = order_by_option.strip().split(" ") +2329 order_by_option_split_column = order_by_option_split[0] +2330 if len(order_by_option_split) > 1: +2331 order_by_option_split_order = order_by_option_split[1] +2332 else: +2333 order_by_option_split_order = "ASC" +2334 # Chek if column exists +2335 if ( +2336 order_by_option_split_column.replace('"', "").strip() +2337 in existing_columns +2338 ): +2339 order_by_list.append( +2340 f"{order_by_option_split_column} {order_by_option_split_order}" +2341 ) +2342 +2343 # Clean order by +2344 order_by_clean = ", ".join(order_by_list) +2345 +2346 # Query values +2347 default_empty_value = "" +2348 query_export_format = None +2349 include_header = False +2350 post_process = False +2351 order_by_sql = "" +2352 post_process_just_move = False 2353 -2354 # Check VCF format with extra columns -2355 extra_columns_clean = [] +2354 # export options +2355 export_options = {} 2356 -2357 # Force samples list in parameter -2358 if sample_list: -2359 if "FORMAT" in extra_columns: -2360 extra_columns = ["FORMAT"] + sample_list -2361 else: -2362 extra_columns = sample_list +2357 # VCF +2358 if output_type in ["vcf"]: +2359 if not self.is_vcf(database=database_conn, sql_query=query): +2360 extra_columns = [] +2361 else: +2362 extra_columns = existing_columns_header 2363 -2364 # Check columns -2365 else: -2366 for extra_column in extra_columns: -2367 if extra_column not in needed_columns and ( -2368 extra_column == "FORMAT" -2369 or ( -2370 "FORMAT" in extra_columns_clean -2371 and self.is_genotype_column( -2372 database=database, column=extra_column -2373 ) -2374 ) -2375 ): -2376 extra_columns_clean.append(extra_column) -2377 extra_columns = extra_columns_clean -2378 -2379 default_empty_value = "." 
-2380 query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER, QUOTE '', COMPRESSION 'gzip'" -2381 include_header = True -2382 post_process = True -2383 # Export options -2384 if not compression_type: -2385 compression_type = "bgzip" -2386 export_options = { -2387 "format": "CSV", -2388 "delimiter": delimiter, -2389 "header": True, -2390 "quote": None, -2391 "compression": compression_type, -2392 } -2393 compresslevel = 1 -2394 -2395 # TSV/CSV/TBL -2396 elif output_type in ["tsv", "csv", "tbl"]: -2397 if output_type in ["csv", "tbl"]: -2398 quote = '"' -2399 else: -2400 quote = "" -2401 query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER, QUOTE '{quote}', COMPRESSION 'gzip'" -2402 if delimiter in ["\t"]: -2403 include_header = header_in_output and True -2404 post_process = True -2405 if order_by_clean: -2406 order_by_sql = f"ORDER BY {order_by_clean}" -2407 # Export options -2408 if not compression_type: -2409 compression_type = "gzip" -2410 export_options = { -2411 "format": "CSV", -2412 "delimiter": delimiter, -2413 "header": True, -2414 "quote": quote, -2415 "compression": compression_type, -2416 } -2417 -2418 # JSON -2419 elif output_type in ["json"]: -2420 query_export_format = "FORMAT JSON, ARRAY TRUE" -2421 include_header = False -2422 post_process = True -2423 if order_by_clean: -2424 order_by_sql = f"ORDER BY {order_by_clean}" -2425 # Export options -2426 if not compression_type: -2427 compression_type = "gzip" -2428 export_options = { -2429 "format": "JSON", -2430 "array": True, -2431 "compression": compression_type, -2432 } -2433 -2434 # Parquet -2435 elif output_type in ["parquet"]: -2436 query_export_format = "FORMAT PARQUET" -2437 # Export options +2364 # Check VCF format with extra columns +2365 extra_columns_clean = [] +2366 +2367 # Force samples list in parameter +2368 if sample_list: +2369 if "FORMAT" in extra_columns: +2370 extra_columns = ["FORMAT"] + sample_list +2371 else: +2372 extra_columns = sample_list +2373 +2374 # Check columns +2375 else: +2376 for extra_column in extra_columns: +2377 if extra_column not in needed_columns and ( +2378 extra_column == "FORMAT" +2379 or ( +2380 "FORMAT" in extra_columns_clean +2381 and self.is_genotype_column( +2382 database=database, column=extra_column +2383 ) +2384 ) +2385 ): +2386 extra_columns_clean.append(extra_column) +2387 extra_columns = extra_columns_clean +2388 +2389 default_empty_value = "." 
+2390 query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER, QUOTE '', COMPRESSION 'gzip'" +2391 include_header = True +2392 post_process = True +2393 # Export options +2394 if not compression_type: +2395 compression_type = "bgzip" +2396 export_options = { +2397 "format": "CSV", +2398 "delimiter": delimiter, +2399 "header": True, +2400 "quote": None, +2401 "compression": compression_type, +2402 } +2403 compresslevel = 1 +2404 +2405 # TSV/CSV/TBL +2406 elif output_type in ["tsv", "csv", "tbl"]: +2407 if output_type in ["csv", "tbl"]: +2408 quote = '"' +2409 else: +2410 quote = "" +2411 query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER, QUOTE '{quote}', COMPRESSION 'gzip'" +2412 if delimiter in ["\t"]: +2413 include_header = header_in_output and True +2414 post_process = True +2415 if order_by_clean: +2416 order_by_sql = f"ORDER BY {order_by_clean}" +2417 # Export options +2418 if not compression_type: +2419 compression_type = "gzip" +2420 export_options = { +2421 "format": "CSV", +2422 "delimiter": delimiter, +2423 "header": True, +2424 "quote": quote, +2425 "compression": compression_type, +2426 } +2427 +2428 # JSON +2429 elif output_type in ["json"]: +2430 query_export_format = "FORMAT JSON, ARRAY TRUE" +2431 include_header = False +2432 post_process = True +2433 if order_by_clean: +2434 order_by_sql = f"ORDER BY {order_by_clean}" +2435 # Export options +2436 if not compression_type: +2437 compression_type = "gzip" 2438 export_options = { -2439 "format": "PARQUET", -2440 } -2441 include_header = False -2442 post_process = True +2439 "format": "JSON", +2440 "array": True, +2441 "compression": compression_type, +2442 } 2443 -2444 # BED -2445 elif output_type in ["bed"]: -2446 query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER" -2447 include_header = True -2448 post_process = True -2449 if order_by_clean: -2450 order_by_sql = f"ORDER BY {order_by_clean}" -2451 # Export options -2452 if not compression_type: -2453 compression_type = "gzip" -2454 export_options = { -2455 "format": "CSV", -2456 "delimiter": delimiter, -2457 "header": True, -2458 "quote": None, -2459 "compression": compression_type, -2460 } -2461 -2462 # duckDB -2463 elif output_type in ["duckdb"]: -2464 -2465 # Needed column -2466 needed_columns = [] -2467 -2468 # Export database as Parquet -2469 database_export_parquet_file = os.path.join(tmp_dir, "output.parquet") -2470 self.export( -2471 database=database, output_database=database_export_parquet_file -2472 ) -2473 -2474 # Create database and connexion -2475 output_database_conn = duckdb.connect(output_database) -2476 -2477 # Create table in database connexion with Parquet file -2478 query_copy = f""" -2479 CREATE TABLE {table} -2480 AS {self.get_sql_database_link(database=database_export_parquet_file)} -2481 """ -2482 output_database_conn.execute(query_copy) +2444 # Parquet +2445 elif output_type in ["parquet"]: +2446 query_export_format = "FORMAT PARQUET" +2447 # Export options +2448 export_options = { +2449 "format": "PARQUET", +2450 } +2451 include_header = False +2452 post_process = True +2453 +2454 # BED +2455 elif output_type in ["bed"]: +2456 query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER" +2457 include_header = True +2458 post_process = True +2459 if order_by_clean: +2460 order_by_sql = f"ORDER BY {order_by_clean}" +2461 # Export options +2462 if not compression_type: +2463 compression_type = "gzip" +2464 export_options = { +2465 "format": "CSV", +2466 "delimiter": delimiter, +2467 "header": True, 
+2468 "quote": None, +2469 "compression": compression_type, +2470 } +2471 +2472 # duckDB +2473 elif output_type in ["duckdb"]: +2474 +2475 # Needed column +2476 needed_columns = [] +2477 +2478 # Export database as Parquet +2479 database_export_parquet_file = os.path.join(tmp_dir, "output.parquet") +2480 self.export( +2481 database=database, output_database=database_export_parquet_file +2482 ) 2483 -2484 # Close connexion -2485 output_database_conn.close() +2484 # Create database and connexion +2485 output_database_conn = duckdb.connect(output_database) 2486 -2487 # remove tmp -2488 remove_if_exists([database_export_parquet_file]) -2489 -2490 return os.path.exists(output_database) -2491 -2492 # Partition -2493 if parquet_partitions: -2494 parquet_partitions_clean = [] -2495 parquet_partitions_array = [] -2496 for parquet_partition in parquet_partitions: -2497 parquet_partitions_array.append( -2498 parquet_partition.translate({'"': None, "'": None, " ": None}) -2499 ) -2500 parquet_partitions_clean.append( -2501 '"' -2502 + parquet_partition.translate({'"': None, "'": None, " ": None}) -2503 + '"' -2504 ) -2505 parquet_partitions_by = ",".join(parquet_partitions_clean) -2506 query_export_format += ( -2507 f", PARTITION_BY ({parquet_partitions_by}), OVERWRITE_OR_IGNORE" -2508 ) -2509 export_options["partition_by"] = parquet_partitions_array -2510 if export_options.get("format", None) == "CSV": -2511 export_mode = "duckdb" -2512 post_process_just_move = True -2513 -2514 # Construct query columns -2515 query_columns = [] -2516 -2517 # Add Needed columns -2518 for needed_column in needed_columns: -2519 if needed_columns[needed_column]: -2520 query_column_name = needed_columns[needed_column] -2521 query_column = f""" "{needed_columns[needed_column]}" """ -2522 else: -2523 query_column_name = default_empty_value -2524 query_column = f""" '{default_empty_value}' """ -2525 query_column_as = f""" "{needed_column}" """ -2526 if query_column_name == needed_column: -2527 query_columns.append(f""" {query_column} """) -2528 else: -2529 query_columns.append(f""" {query_column} AS {query_column_as} """) -2530 -2531 # Add Extra columns -2532 for extra_column in extra_columns: -2533 if extra_column not in needed_columns: -2534 # query_columns.append(f""" "{extra_column}" AS "{extra_column}" """) -2535 query_columns.append(f""" "{extra_column}" """) -2536 -2537 # Query export columns -2538 query_export_columns = f""" {",".join(query_columns)} """ -2539 -2540 if query_columns: -2541 -2542 # Compressed tmp file -2543 query_output_database_tmp = os.path.join(tmp_dir, "output") -2544 -2545 # Query -2546 # If no query, generate query of the database -2547 if not query: -2548 query = f""" -2549 SELECT {query_export_columns} -2550 FROM {self.get_sql_database_link(database=database)} -2551 {order_by_sql} -2552 """ -2553 -2554 # Test empty query -2555 df = self.conn.execute(query).fetch_record_batch(1) -2556 query_empty = True -2557 for d in df: -2558 query_empty = False -2559 break -2560 if query_empty: -2561 log.error("Export failed: Empty") -2562 raise ValueError("Export failed: Empty") +2487 # Create table in database connexion with Parquet file +2488 query_copy = f""" +2489 CREATE TABLE {table} +2490 AS {self.get_sql_database_link(database=database_export_parquet_file)} +2491 """ +2492 output_database_conn.execute(query_copy) +2493 +2494 # Close connexion +2495 output_database_conn.close() +2496 +2497 # remove tmp +2498 remove_if_exists([database_export_parquet_file]) +2499 +2500 return 
os.path.exists(output_database) +2501 +2502 # Partition +2503 if parquet_partitions: +2504 parquet_partitions_clean = [] +2505 parquet_partitions_array = [] +2506 for parquet_partition in parquet_partitions: +2507 parquet_partitions_array.append( +2508 parquet_partition.translate({'"': None, "'": None, " ": None}) +2509 ) +2510 parquet_partitions_clean.append( +2511 '"' +2512 + parquet_partition.translate({'"': None, "'": None, " ": None}) +2513 + '"' +2514 ) +2515 parquet_partitions_by = ",".join(parquet_partitions_clean) +2516 query_export_format += ( +2517 f", PARTITION_BY ({parquet_partitions_by}), OVERWRITE_OR_IGNORE" +2518 ) +2519 export_options["partition_by"] = parquet_partitions_array +2520 if export_options.get("format", None) == "CSV": +2521 export_mode = "duckdb" +2522 post_process_just_move = True +2523 +2524 # Construct query columns +2525 query_columns = [] +2526 +2527 # Add Needed columns +2528 for needed_column in needed_columns: +2529 if needed_columns[needed_column]: +2530 query_column_name = needed_columns[needed_column] +2531 query_column = f""" "{needed_columns[needed_column]}" """ +2532 else: +2533 query_column_name = default_empty_value +2534 query_column = f""" '{default_empty_value}' """ +2535 query_column_as = f""" "{needed_column}" """ +2536 if query_column_name == needed_column: +2537 query_columns.append(f""" {query_column} """) +2538 else: +2539 query_columns.append(f""" {query_column} AS {query_column_as} """) +2540 +2541 # Add Extra columns +2542 for extra_column in extra_columns: +2543 if extra_column not in needed_columns: +2544 # query_columns.append(f""" "{extra_column}" AS "{extra_column}" """) +2545 query_columns.append(f""" "{extra_column}" """) +2546 +2547 # Query export columns +2548 query_export_columns = f""" {",".join(query_columns)} """ +2549 +2550 if query_columns: +2551 +2552 # Compressed tmp file +2553 query_output_database_tmp = os.path.join(tmp_dir, "output") +2554 +2555 # Query +2556 # If no query, generate query of the database +2557 if not query: +2558 query = f""" +2559 SELECT {query_export_columns} +2560 FROM {self.get_sql_database_link(database=database)} +2561 {order_by_sql} +2562 """ 2563 -2564 # Schema names -2565 schema_names = None -2566 -2567 # Export mode pyarrow -2568 if export_mode == "pyarrow": -2569 -2570 # Compression mode -2571 # If compress required and compression type as gzip or bgzip -2572 # For bgzip compression, recompression will be done, and compression with gzip for tmp file done (level 1) -2573 # to reduce tmp file size -2574 compression_mode_gzip = compressed and ( -2575 export_options.get("compression", None) in ["gzip", "bgzip"] -2576 ) -2577 -2578 # File stream mode (str or bytes) -2579 f_mode = "" -2580 if compression_mode_gzip: -2581 f_mode = "b" -2582 -2583 if include_header: -2584 -2585 # Open stream files (uncompressed and compressed) -2586 with open(query_output_database_tmp, mode="w") as f, pgzip.open( -2587 query_output_database_tmp, -2588 mode="w", -2589 thread=threads, -2590 compresslevel=compresslevel, -2591 ) as f_gz: -2592 -2593 # Switch to compressed stream file -2594 if compression_mode_gzip: -2595 f = f_gz +2564 # Test empty query +2565 df = self.conn.execute(query).fetch_record_batch(1) +2566 query_empty = True +2567 for d in df: +2568 query_empty = False +2569 break +2570 if query_empty: +2571 log.warning("Export warning: Empty") +2572 remove_header_line = False +2573 else: +2574 remove_header_line = True +2575 +2576 # Schema names +2577 schema_names = None +2578 +2579 # Export mode 
pyarrow +2580 if export_mode == "pyarrow": +2581 +2582 # Compression mode +2583 # If compress required and compression type as gzip or bgzip +2584 # For bgzip compression, recompression will be done, and compression with gzip for tmp file done (level 1) +2585 # to reduce tmp file size +2586 compression_mode_gzip = compressed and ( +2587 export_options.get("compression", None) in ["gzip", "bgzip"] +2588 ) +2589 +2590 # File stream mode (str or bytes) +2591 f_mode = "" +2592 if compression_mode_gzip: +2593 f_mode = "b" +2594 +2595 if include_header: 2596 -2597 # Generate header tmp file -2598 query_output_header_tmp = os.path.join(tmp_dir, "header") -2599 self.get_header_file( -2600 header_file=query_output_header_tmp, -2601 remove_header_line=True, -2602 sql_query=query, -2603 ) +2597 # Open stream files (uncompressed and compressed) +2598 with open(query_output_database_tmp, mode="w") as f, pgzip.open( +2599 query_output_database_tmp, +2600 mode="w", +2601 thread=threads, +2602 compresslevel=compresslevel, +2603 ) as f_gz: 2604 -2605 # Write header to tmp file -2606 with open( -2607 query_output_header_tmp, "r" + f_mode -2608 ) as output_header_tmp: -2609 f.write(output_header_tmp.read()) -2610 -2611 # JSON format - Add special "[" character at the beginning of the file -2612 if export_options.get("format") in ["JSON"]: -2613 -2614 # Open stream files (uncompressed and compressed) -2615 with open(query_output_database_tmp, mode="a") as f, pgzip.open( -2616 query_output_database_tmp, -2617 mode="a", -2618 thread=threads, -2619 compresslevel=compresslevel, -2620 ) as f_gz: -2621 -2622 # Switch to compressed stream file -2623 if compression_mode_gzip: -2624 f = f_gz -2625 f.write(b"[\n") -2626 else: -2627 f.write("[\n") -2628 -2629 # Open stream files (uncompressed and compressed) for chunk -2630 with open(query_output_database_tmp, mode="a") as f, pgzip.open( -2631 query_output_database_tmp, -2632 mode="a", -2633 thread=threads, -2634 compresslevel=compresslevel, -2635 ) as f_gz: -2636 -2637 # Switch to compressed stream file -2638 if compression_mode_gzip: -2639 f = f_gz +2605 # Switch to compressed stream file +2606 if compression_mode_gzip: +2607 f = f_gz +2608 +2609 # Generate header tmp file +2610 query_output_header_tmp = os.path.join(tmp_dir, "header") +2611 self.get_header_file( +2612 header_file=query_output_header_tmp, +2613 remove_header_line=remove_header_line, +2614 sql_query=query, +2615 ) +2616 +2617 # Write header to tmp file +2618 with open( +2619 query_output_header_tmp, "r" + f_mode +2620 ) as output_header_tmp: +2621 f.write(output_header_tmp.read()) +2622 +2623 # JSON format - Add special "[" character at the beginning of the file +2624 if export_options.get("format") in ["JSON"]: +2625 +2626 # Open stream files (uncompressed and compressed) +2627 with open(query_output_database_tmp, mode="a") as f, pgzip.open( +2628 query_output_database_tmp, +2629 mode="a", +2630 thread=threads, +2631 compresslevel=compresslevel, +2632 ) as f_gz: +2633 +2634 # Switch to compressed stream file +2635 if compression_mode_gzip: +2636 f = f_gz +2637 f.write(b"[\n") +2638 else: +2639 f.write("[\n") 2640 -2641 # Chunk query with batch of dataframes of chunk_size -2642 df = self.conn.execute(query).fetch_record_batch(chunk_size) -2643 -2644 # id of chunk -2645 i = 0 -2646 -2647 # For each chunk dataframe -2648 for d in df: -2649 -2650 # Schema names -2651 schema_names = d.schema.names +2641 # Open stream files (uncompressed and compressed) for chunk +2642 with open(query_output_database_tmp, 
mode="a") as f, pgzip.open( +2643 query_output_database_tmp, +2644 mode="a", +2645 thread=threads, +2646 compresslevel=compresslevel, +2647 ) as f_gz: +2648 +2649 # Switch to compressed stream file +2650 if compression_mode_gzip: +2651 f = f_gz 2652 -2653 # id of chunk -2654 i += 1 +2653 # Chunk query with batch of dataframes of chunk_size +2654 df = self.conn.execute(query).fetch_record_batch(chunk_size) 2655 -2656 # Log - number of records -2657 log.debug(f"Chunk {i}: records process...") +2656 # id of chunk +2657 i = 0 2658 -2659 # Check process for first chunk -2660 if i == 1: +2659 # For each chunk dataframe +2660 for d in df: 2661 -2662 # If include header in file -2663 header = export_options.get("header", True) +2662 # Schema names +2663 schema_names = d.schema.names 2664 -2665 # Parquet output format -2666 # Either a folder or a writer -2667 if export_options.get("format") in ["PARQUET"]: -2668 -2669 # For Parquet with multiple file - folder -2670 if export_options.get( -2671 "partition_by", None -2672 ) or export_options.get("per_thread_output", False): -2673 query_output_database_tmp = ( -2674 f"{query_output_database_tmp}.parquet" -2675 ) +2665 # id of chunk +2666 i += 1 +2667 +2668 # Log - number of records +2669 log.debug(f"Chunk {i}: records process...") +2670 +2671 # Check process for first chunk +2672 if i == 1: +2673 +2674 # If include header in file +2675 header = export_options.get("header", True) 2676 -2677 # For Parquet as a unique file - writer -2678 else: -2679 writer = pq.ParquetWriter( -2680 query_output_database_tmp, d.schema -2681 ) -2682 -2683 else: -2684 -2685 # Switch of header in file for not first chunk -2686 header = False -2687 -2688 # CSV format -2689 if export_options.get("format") in ["CSV"]: -2690 -2691 # With quote option -2692 if export_options.get("quote", None): -2693 -2694 # Polars write dataframe -2695 pl.from_arrow(d).write_csv( -2696 file=f, -2697 separator=export_options.get("delimiter", ""), -2698 include_header=header, -2699 quote_char=export_options.get("quote", '"'), -2700 ) -2701 -2702 # Without quote option -2703 else: -2704 -2705 # Polars write dataframe -2706 pl.from_arrow(d).write_csv( -2707 file=f, -2708 separator=export_options.get("delimiter", ""), -2709 include_header=header, -2710 quote_style="never", -2711 ) -2712 -2713 # JSON format -2714 elif export_options.get("format") in ["JSON"]: -2715 -2716 # Compressed mode gzip -2717 if compression_mode_gzip: -2718 -2719 # Add comma at the beginning of dataframe (if not the first one) in bytes mode -2720 if i > 1: -2721 f.write(b",\n") -2722 -2723 # Write dataframe in bytes mode -2724 f.write( -2725 str.encode( -2726 pl.from_arrow(d) -2727 .write_ndjson() -2728 .replace("\n{", ",\n{") -2729 .replace("[", "") -2730 .replace("]", "") -2731 ) -2732 ) -2733 -2734 # Not compressed mode gzip (string mode) -2735 else: -2736 -2737 # Add comma at the beginning of dataframe (if not the first one) in string mode -2738 if i > 1: -2739 f.write(",\n") -2740 -2741 # Write dataframe in string mode -2742 f.write( -2743 pl.from_arrow(d) -2744 .write_ndjson() -2745 .replace("\n{", ",\n{") -2746 .replace("[", "") -2747 .replace("]", "") -2748 ) -2749 -2750 # Parquet format -2751 elif export_options.get("format") in ["PARQUET"]: +2677 # Parquet output format +2678 # Either a folder or a writer +2679 if export_options.get("format") in ["PARQUET"]: +2680 +2681 # For Parquet with multiple file - folder +2682 if export_options.get( +2683 "partition_by", None +2684 ) or export_options.get("per_thread_output", 
False): +2685 query_output_database_tmp = ( +2686 f"{query_output_database_tmp}.parquet" +2687 ) +2688 +2689 # For Parquet as a unique file - writer +2690 else: +2691 writer = pq.ParquetWriter( +2692 query_output_database_tmp, d.schema +2693 ) +2694 +2695 else: +2696 +2697 # Switch of header in file for not first chunk +2698 header = False +2699 +2700 # CSV format +2701 if export_options.get("format") in ["CSV"]: +2702 +2703 # With quote option +2704 if export_options.get("quote", None): +2705 +2706 # Polars write dataframe +2707 pl.from_arrow(d).write_csv( +2708 file=f, +2709 separator=export_options.get("delimiter", ""), +2710 include_header=header, +2711 quote_char=export_options.get("quote", '"'), +2712 ) +2713 +2714 # Without quote option +2715 else: +2716 +2717 # Polars write dataframe +2718 pl.from_arrow(d).write_csv( +2719 file=f, +2720 separator=export_options.get("delimiter", ""), +2721 include_header=header, +2722 quote_style="never", +2723 ) +2724 +2725 # JSON format +2726 elif export_options.get("format") in ["JSON"]: +2727 +2728 # Compressed mode gzip +2729 if compression_mode_gzip: +2730 +2731 # Add comma at the beginning of dataframe (if not the first one) in bytes mode +2732 if i > 1: +2733 f.write(b",\n") +2734 +2735 # Write dataframe in bytes mode +2736 f.write( +2737 str.encode( +2738 pl.from_arrow(d) +2739 .write_ndjson() +2740 .replace("\n{", ",\n{") +2741 .replace("[", "") +2742 .replace("]", "") +2743 ) +2744 ) +2745 +2746 # Not compressed mode gzip (string mode) +2747 else: +2748 +2749 # Add comma at the beginning of dataframe (if not the first one) in string mode +2750 if i > 1: +2751 f.write(",\n") 2752 -2753 # Partition by fields -2754 partition_by = export_options.get("partition_by", None) -2755 -2756 if partition_by: -2757 -2758 # For No partition but split parquet files into a folder -2759 if "None" in partition_by: -2760 partition_by = None +2753 # Write dataframe in string mode +2754 f.write( +2755 pl.from_arrow(d) +2756 .write_ndjson() +2757 .replace("\n{", ",\n{") +2758 .replace("[", "") +2759 .replace("]", "") +2760 ) 2761 -2762 # Pyarrow write -2763 pq.write_to_dataset( -2764 pa.Table.from_batches([d]), -2765 query_output_database_tmp, -2766 partition_cols=partition_by, -2767 use_threads=threads, -2768 existing_data_behavior="overwrite_or_ignore", -2769 ) -2770 -2771 # Parquet in unique file -2772 else: -2773 writer.write_batch(d) -2774 -2775 # Close Parquet writer -2776 if ( -2777 export_options.get("format") in ["PARQUET"] -2778 and not export_options.get("partition_by", None) -2779 and not export_options.get("per_thread_output", None) -2780 ): -2781 writer.close() +2762 # Parquet format +2763 elif export_options.get("format") in ["PARQUET"]: +2764 +2765 # Partition by fields +2766 partition_by = export_options.get("partition_by", None) +2767 +2768 if partition_by: +2769 +2770 # For No partition but split parquet files into a folder +2771 if "None" in partition_by: +2772 partition_by = None +2773 +2774 # Pyarrow write +2775 pq.write_to_dataset( +2776 pa.Table.from_batches([d]), +2777 query_output_database_tmp, +2778 partition_cols=partition_by, +2779 use_threads=threads, +2780 existing_data_behavior="overwrite_or_ignore", +2781 ) 2782 -2783 # JSON format - Add special "]" character at the end of the file -2784 if export_options.get("format") in ["JSON"]: -2785 -2786 # Open stream files (uncompressed and compressed) -2787 with open(query_output_database_tmp, mode="a") as f, pgzip.open( -2788 query_output_database_tmp, -2789 mode="a", -2790 
thread=threads, -2791 compresslevel=compresslevel, -2792 ) as f_gz: -2793 -2794 # Switch to compressed stream file -2795 if compression_mode_gzip: -2796 f = f_gz -2797 f.write(b"]\n") -2798 else: -2799 f.write("]\n") -2800 -2801 # Export mode duckdb -2802 elif export_mode == "duckdb": -2803 -2804 # Create COPY TO query -2805 query_copy = f""" -2806 {query_set} -2807 COPY ( -2808 {query} -2809 ) -2810 TO '{query_output_database_tmp}' -2811 WITH ({query_export_format}) -2812 """ -2813 -2814 # Export with duckdb -2815 self.query(query=query_copy) -2816 -2817 # Export mode unknown -2818 else: -2819 log.error(f"Export mode '{export_mode}' unknown") -2820 raise ValueError(f"Export mode '{export_mode}' unknown") -2821 -2822 # Post process -2823 if post_process: -2824 -2825 # Log - number of records -2826 log.debug("Post processing...") -2827 -2828 # Input files -2829 input_files = [] -2830 -2831 # Export mode duckdb and include header -2832 if export_mode == "duckdb" and include_header: +2783 # Parquet in unique file +2784 else: +2785 writer.write_batch(d) +2786 +2787 # Close Parquet writer +2788 if ( +2789 export_options.get("format") in ["PARQUET"] +2790 and not export_options.get("partition_by", None) +2791 and not export_options.get("per_thread_output", None) +2792 ): +2793 writer.close() +2794 +2795 # JSON format - Add special "]" character at the end of the file +2796 if export_options.get("format") in ["JSON"]: +2797 +2798 # Open stream files (uncompressed and compressed) +2799 with open(query_output_database_tmp, mode="a") as f, pgzip.open( +2800 query_output_database_tmp, +2801 mode="a", +2802 thread=threads, +2803 compresslevel=compresslevel, +2804 ) as f_gz: +2805 +2806 # Switch to compressed stream file +2807 if compression_mode_gzip: +2808 f = f_gz +2809 f.write(b"]\n") +2810 else: +2811 f.write("]\n") +2812 +2813 # Export mode duckdb +2814 elif export_mode == "duckdb": +2815 +2816 # Create COPY TO query +2817 query_copy = f""" +2818 {query_set} +2819 COPY ( +2820 {query} +2821 ) +2822 TO '{query_output_database_tmp}' +2823 WITH ({query_export_format}) +2824 """ +2825 +2826 # Export with duckdb +2827 self.query(query=query_copy) +2828 +2829 # Export mode unknown +2830 else: +2831 log.error(f"Export mode '{export_mode}' unknown") +2832 raise ValueError(f"Export mode '{export_mode}' unknown") 2833 -2834 # create tmp header file -2835 query_output_header_tmp = os.path.join(tmp_dir, "header") -2836 tmp_files.append(query_output_header_tmp) -2837 self.get_header_file( -2838 header_file=query_output_header_tmp, remove_header_line=True -2839 ) -2840 -2841 # Add tmp header file for concat and compress -2842 input_files.append(query_output_header_tmp) -2843 -2844 # Add variants file -2845 input_files.append(query_output_database_tmp) -2846 -2847 # Output with concat and compress -2848 if not post_process_just_move and ( -2849 export_mode == "duckdb" -2850 or ( -2851 compressed -2852 and export_options.get("compression", None) == "bgzip" -2853 ) -2854 or sort -2855 or index -2856 ): -2857 -2858 # Compression type -2859 if not compressed: -2860 compression_type = "none" -2861 else: -2862 compression_type = export_options.get( -2863 "compression", "bgzip" -2864 ) -2865 -2866 # Concat and compress -2867 concat_and_compress_files( -2868 input_files=input_files, -2869 output_file=output_database, -2870 compression_type=compression_type, -2871 threads=threads, -2872 sort=sort, -2873 index=index, -2874 ) -2875 -2876 # Output already generated file (either compressed in gzip or not compressed, with 
included header if needed) -2877 else: -2878 -2879 # Move tmp file -2880 shutil.move(query_output_database_tmp, output_database) -2881 -2882 # Generate associated header file -2883 if output_header and export_header: -2884 -2885 # Log - Generate header -2886 log.debug("Generate header...") +2834 # Post process +2835 if post_process: +2836 +2837 # Log - number of records +2838 log.debug("Post processing...") +2839 +2840 # Input files +2841 input_files = [] +2842 +2843 # Export mode duckdb and include header +2844 if export_mode == "duckdb" and include_header: +2845 +2846 # create tmp header file +2847 query_output_header_tmp = os.path.join(tmp_dir, "header") +2848 tmp_files.append(query_output_header_tmp) +2849 self.get_header_file( +2850 header_file=query_output_header_tmp, remove_header_line=remove_header_line +2851 ) +2852 +2853 # Add tmp header file for concat and compress +2854 input_files.append(query_output_header_tmp) +2855 +2856 # Add variants file +2857 input_files.append(query_output_database_tmp) +2858 +2859 # Output with concat and compress +2860 if not post_process_just_move and ( +2861 export_mode == "duckdb" +2862 or ( +2863 compressed +2864 and export_options.get("compression", None) == "bgzip" +2865 ) +2866 or sort +2867 or index +2868 ): +2869 +2870 # Compression type +2871 if not compressed: +2872 compression_type = "none" +2873 else: +2874 compression_type = export_options.get( +2875 "compression", "bgzip" +2876 ) +2877 +2878 # Concat and compress +2879 concat_and_compress_files( +2880 input_files=input_files, +2881 output_file=output_database, +2882 compression_type=compression_type, +2883 threads=threads, +2884 sort=sort, +2885 index=index, +2886 ) 2887 -2888 # Create database -2889 database_for_header = Database(database=output_database) +2888 # Output already generated file (either compressed in gzip or not compressed, with included header if needed) +2889 else: 2890 -2891 # Remove header if exists -2892 remove_if_exists([output_header]) +2891 # Move tmp file +2892 shutil.move(query_output_database_tmp, output_database) 2893 -2894 # Find columns in database -2895 if schema_names: -2896 header_columns_from_database = schema_names -2897 else: -2898 header_columns_from_database = ( -2899 database_for_header.get_header_columns_from_database( -2900 database=output_database -2901 ) -2902 ) -2903 -2904 # Generate header file -2905 database_for_header.get_header_file( -2906 header_file=output_header, -2907 replace_header_line=header_columns_from_database, -2908 force=True, -2909 ) -2910 -2911 # Clean tmp files (deprecated) -2912 remove_if_exists(tmp_files) -2913 -2914 # Return if file exists -2915 return os.path.exists(output_database) and self.get_type(output_database) +2894 # Generate associated header file +2895 if output_header and export_header: +2896 +2897 # Log - Generate header +2898 log.debug("Generate header...") +2899 +2900 # Create database +2901 database_for_header = Database(database=output_database) +2902 +2903 # Remove header if exists +2904 remove_if_exists([output_header]) +2905 +2906 # Find columns in database +2907 if schema_names: +2908 header_columns_from_database = schema_names +2909 else: +2910 header_columns_from_database = ( +2911 database_for_header.get_header_columns_from_database( +2912 database=output_database +2913 ) +2914 ) +2915 +2916 # Generate header file +2917 database_for_header.get_header_file( +2918 header_file=output_header, +2919 replace_header_line=header_columns_from_database, +2920 force=True, +2921 ) +2922 +2923 # Clean tmp files 
(deprecated) +2924 remove_if_exists(tmp_files) +2925 +2926 # Return if file exists +2927 return os.path.exists(output_database) and self.get_type(output_database) @@ -9292,15 +9316,25 @@
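As a point of reference, here is a hedged sketch of how the `export` method documented above might be called; the import path `howard.objects.database` and the file names are assumptions, not taken from this diff, while the parameter names follow the docstring shown above.

```python
# Hypothetical usage of Database.export(); module path and file names are assumed.
from howard.objects.database import Database

db = Database(database="variants.parquet")

# Export to a bgzip-compressed, sorted and indexed VCF, streaming the data
# in chunks of one million records with the default "pyarrow" export mode.
ok = db.export(
    output_database="variants.vcf.gz",
    output_header="variants.vcf.gz.hdr",
    export_mode="pyarrow",
    chunk_size=1_000_000,
    sort=True,
    index=True,
    order_by='"#CHROM" ASC, "POS" ASC',
)
print("export succeeded:", ok)
```

The "pyarrow" path itself boils down to streaming the DuckDB result as Arrow record batches and appending each batch to the output file; a minimal standalone sketch of that pattern (requires `duckdb` and a recent Polars), with the table content assumed for the example:

```python
import duckdb
import polars as pl

conn = duckdb.connect()  # in-memory example database
conn.execute("CREATE TABLE variants AS SELECT 'chr1' AS CHROM, range AS POS FROM range(10)")

# Stream the query result as Arrow record batches of (at most) 4 rows each
reader = conn.execute("SELECT CHROM, POS FROM variants").fetch_record_batch(4)

with open("variants.tsv", mode="w") as f:
    for i, batch in enumerate(reader, start=1):
        # Write each chunk with Polars; include the header only for the first chunk
        pl.from_arrow(batch).write_csv(
            file=f,
            separator="\t",
            include_header=(i == 1),
            quote_style="never",
        )
```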

    Returns
    1810 "bed", 1811 "json", 1812 ]: -1813 sql_from = self.get_sql_from( -1814 database=database, header_file=header_file -1815 ) -1816 sql_query = f"SELECT * FROM {sql_from} LIMIT 0" -1817 return list(self.conn.query(sql_query).columns) -1818 except ValueError: -1819 return [] -1820 -1821 return [] +1813 # Query +1814 sql_from = self.get_sql_from( +1815 database=database, header_file=header_file +1816 ) +1817 sql_query = f"SELECT * FROM {sql_from} LIMIT 0" +1818 +1819 # Get columns +1820 result_description = self.conn.execute(sql_query).description +1821 +1822 # Extract columns' names +1823 columns = [desc[0] for desc in result_description] +1824 +1825 # Return columns as list +1826 return columns +1827 +1828 except ValueError: +1829 return [] +1830 +1831 return [] @@ -9350,33 +9384,33 @@
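The reworked column lookup above relies on the DB-API style `description` attribute that DuckDB exposes after `execute()`; a small standalone illustration of the same pattern, with the example table assumed:

```python
import duckdb

conn = duckdb.connect()  # in-memory database for the example
conn.execute(
    """CREATE TABLE variants AS SELECT 'chr1' AS "#CHROM", 1000 AS "POS", 'A' AS "REF", 'T' AS "ALT" """
)

# A LIMIT 0 query returns no rows but still carries the result schema
sql_query = "SELECT * FROM variants LIMIT 0"
result_description = conn.execute(sql_query).description

# The first element of each description tuple is the column name
columns = [desc[0] for desc in result_description]
print(columns)  # ['#CHROM', 'POS', 'REF', 'ALT']
```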

    Returns
    -
    1823    def get_table_columns_from_format(self, database: str = None) -> list:
    -1824        """
    -1825        The function `get_table_columns_from_format` returns a list of table columns based on the
    -1826        specified database format.
    -1827
    -1828        :param database: The `database` parameter is a string that represents the name of the database.
    -1829        It is an optional parameter, which means it has a default value of `None`. If no value is
    -1830        provided for the `database` parameter, the `get_database()` method is called to retrieve the
    -1831        current database name
    -1832        :type database: str
    -1833        :return: a list of table columns.
    -1834        """
    -1835
    -1836        table_columns = None
    +            
    1833    def get_table_columns_from_format(self, database: str = None) -> list:
    +1834        """
    +1835        The function `get_table_columns_from_format` returns a list of table columns based on the
    +1836        specified database format.
     1837
    -1838        if not database:
    -1839            database = self.get_database()
    -1840
    -1841        database_format = self.get_format(database)
    -1842
    -1843        needed_columns = DATABASE_TYPE_NEEDED_COLUMNS.get(database_format, None)
    -1844        if needed_columns:
    -1845            table_columns = list(needed_columns.keys())
    -1846        else:
    -1847            table_columns = []
    -1848
    -1849        return table_columns
    +1838        :param database: The `database` parameter is a string that represents the name of the database.
    +1839        It is an optional parameter, which means it has a default value of `None`. If no value is
    +1840        provided for the `database` parameter, the `get_database()` method is called to retrieve the
    +1841        current database name
    +1842        :type database: str
    +1843        :return: a list of table columns.
    +1844        """
    +1845
    +1846        table_columns = None
    +1847
    +1848        if not database:
    +1849            database = self.get_database()
    +1850
    +1851        database_format = self.get_format(database)
    +1852
    +1853        needed_columns = DATABASE_TYPE_NEEDED_COLUMNS.get(database_format, None)
    +1854        if needed_columns:
    +1855            table_columns = list(needed_columns.keys())
    +1856        else:
    +1857            table_columns = []
    +1858
    +1859        return table_columns
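A self-contained sketch of the lookup performed by `get_table_columns_from_format`; the mapping below is illustrative only and is not the project's actual `DATABASE_TYPE_NEEDED_COLUMNS` constant:

```python
# Illustrative stand-in for DATABASE_TYPE_NEEDED_COLUMNS (assumed content)
DATABASE_TYPE_NEEDED_COLUMNS = {
    "vcf": {"#CHROM": "#CHROM", "POS": "POS", "REF": "REF", "ALT": "ALT"},
    "regions": {"#CHROM": "#CHROM", "START": "START", "END": "END"},
}

def table_columns_from_format(database_format: str) -> list:
    """Return the expected column names for a format, or an empty list."""
    needed_columns = DATABASE_TYPE_NEEDED_COLUMNS.get(database_format, None)
    return list(needed_columns.keys()) if needed_columns else []

print(table_columns_from_format("regions"))  # ['#CHROM', 'START', 'END']
print(table_columns_from_format("unknown"))  # []
```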
     
    @@ -9412,81 +9446,81 @@
    Returns
    -
    1851    def get_table_columns_from_file(
    -1852        self,
    -1853        database: str = None,
    -1854        header_file: str = None,
    -1855        header_file_find: bool = True,
    -1856    ) -> list:
    -1857        """
    -1858        The function `get_table_columns_from_file` retrieves the column names from a database or header
    -1859        file.
    -1860
    -1861        :param database: The `database` parameter is a string that represents the name or path of the
    -1862        database file. If this parameter is not provided, the `get_database()` method is called to
    -1863        retrieve the database name or path
    -1864        :type database: str
    -1865        :param header_file: The `header_file` parameter is a string that represents the file path or
    -1866        name of the header file. This file contains the header information for a table, which typically
    -1867        includes the names of the columns in the table
    -1868        :type header_file: str
    -1869        :param header_file_find: Allow header file find if not provided
    -1869        :param header_file_find: Whether to search for a header file automatically if none is provided
    -1870        :type header_file_find: bool
    -1871        :return: a list of table columns.
    -1872        """
    -1873
    -1874        table_columns = None
    -1875
    -1876        if not database:
    -1877            database = self.get_database()
    -1878
    -1879        if not header_file and header_file_find:
    -1880            header_file = self.get_header_file()
    -1881
    -1882        if not header_file and header_file_find:
    -1883            header_file = self.find_header_file(database)
    -1884
    -1885        database_format = self.get_format(database=database)
    -1886        delimiter = SEP_TYPE.get(database_format, "\t")
    -1887
    -1888        # Try from database file
    -1889        try:
    -1890            table_header = self.read_header_file(database)
    -1891        except ValueError:
    -1892            table_header = None
    -1893
    -1894        if table_header:
    -1895            try:
    -1896                table_columns = (
    -1897                    table_header[self.get_header_length(header_file=table_header)]
    -1898                    .strip()
    -1899                    .split(delimiter)
    -1900                )
    -1901            except IndexError:
    -1902                table_columns = None
    -1903        else:
    -1904            table_columns = None
    -1905
    -1906        if not table_columns:
    -1907            # Try from header file
    -1908            try:
    -1909                table_header = self.read_header_file(header_file)
    -1910            except ValueError:
    -1911                table_header = None
    -1912
    -1913            if table_header:
    -1914                try:
    -1915                    table_columns = (
    -1916                        table_header[self.get_header_length(header_file=table_header)]
    -1917                        .strip()
    -1918                        .split(delimiter)
    -1919                    )
    -1920                except IndexError:
    -1921                    table_columns = None
    -1922            else:
    -1923                table_columns = None
    -1924
    -1925        return table_columns
    +            
    1861    def get_table_columns_from_file(
    +1862        self,
    +1863        database: str = None,
    +1864        header_file: str = None,
    +1865        header_file_find: bool = True,
    +1866    ) -> list:
    +1867        """
    +1868        The function `get_table_columns_from_file` retrieves the column names from a database or header
    +1869        file.
    +1870
    +1871        :param database: The `database` parameter is a string that represents the name or path of the
    +1872        database file. If this parameter is not provided, the `get_database()` method is called to
    +1873        retrieve the database name or path
    +1874        :type database: str
    +1875        :param header_file: The `header_file` parameter is a string that represents the file path or
    +1876        name of the header file. This file contains the header information for a table, which typically
    +1877        includes the names of the columns in the table
    +1878        :type header_file: str
    +1879        :param header_file_find: Whether to search for a header file automatically if none is provided
    +1880        :type header_file_find: bool
    +1881        :return: a list of table columns.
    +1882        """
    +1883
    +1884        table_columns = None
    +1885
    +1886        if not database:
    +1887            database = self.get_database()
    +1888
    +1889        if not header_file and header_file_find:
    +1890            header_file = self.get_header_file()
    +1891
    +1892        if not header_file and header_file_find:
    +1893            header_file = self.find_header_file(database)
    +1894
    +1895        database_format = self.get_format(database=database)
    +1896        delimiter = SEP_TYPE.get(database_format, "\t")
    +1897
    +1898        # Try from database file
    +1899        try:
    +1900            table_header = self.read_header_file(database)
    +1901        except ValueError:
    +1902            table_header = None
    +1903
    +1904        if table_header:
    +1905            try:
    +1906                table_columns = (
    +1907                    table_header[self.get_header_length(header_file=table_header)]
    +1908                    .strip()
    +1909                    .split(delimiter)
    +1910                )
    +1911            except IndexError:
    +1912                table_columns = None
    +1913        else:
    +1914            table_columns = None
    +1915
    +1916        if not table_columns:
    +1917            # Try from header file
    +1918            try:
    +1919                table_header = self.read_header_file(header_file)
    +1920            except ValueError:
    +1921                table_header = None
    +1922
    +1923            if table_header:
    +1924                try:
    +1925                    table_columns = (
    +1926                        table_header[self.get_header_length(header_file=table_header)]
    +1927                        .strip()
    +1928                        .split(delimiter)
    +1929                    )
    +1930                except IndexError:
    +1931                    table_columns = None
    +1932            else:
    +1933                table_columns = None
    +1934
    +1935        return table_columns
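The column lookup in `get_table_columns_from_file` ultimately splits the last header line on the format's delimiter; a standalone sketch with an assumed VCF-style header, where counting the "##" meta lines stands in for `get_header_length`:

```python
# Assumed header content for illustration only
table_header = [
    "##fileformat=VCFv4.2",
    '##INFO=<ID=DP,Number=1,Type=Integer,Description="Depth">',
    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
]

delimiter = "\t"  # default delimiter for tab-separated formats

# The line after the "##" meta lines carries the column names
header_length = sum(1 for line in table_header if line.startswith("##"))
table_columns = table_header[header_length].strip().split(delimiter)
print(table_columns)  # ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
```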
     
    @@ -9525,27 +9559,27 @@
    Returns
    -
    1927    def get_annotations(self, database: str = None) -> object:
    -1928        """
    -1929        This function returns the annotations of a database or the default database if none is
    -1930        specified.
    -1931
    -1932        :param database: The parameter `database` is a string that represents the name of the database
    -1933        to retrieve annotations from. If no database name is provided, the method will use the default
    -1934        database name obtained from the `get_database()` method
    -1935        :type database: str
    -1936        :return: The function `get_annotations` returns the `infos` attribute of the header of a
    -1937        database. If the `database` parameter is not provided, it gets the current database using the
    -1938        `get_database` method. If there is no header, it returns `None`.
    -1939        """
    -1940
    -1941        if not database:
    -1942            database = self.get_database()
    -1943
    -1944        if self.get_header(database=database):
    -1945            return self.get_header(database=database).infos
    -1946        else:
    -1947            return None
    +            
    1937    def get_annotations(self, database: str = None) -> object:
    +1938        """
    +1939        This function returns the annotations of a database or the default database if none is
    +1940        specified.
    +1941
    +1942        :param database: The parameter `database` is a string that represents the name of the database
    +1943        to retrieve annotations from. If no database name is provided, the method will use the default
    +1944        database name obtained from the `get_database()` method
    +1945        :type database: str
    +1946        :return: The function `get_annotations` returns the `infos` attribute of the header of a
    +1947        database. If the `database` parameter is not provided, it gets the current database using the
    +1948        `get_database` method. If there is no header, it returns `None`.
    +1949        """
    +1950
    +1951        if not database:
    +1952            database = self.get_database()
    +1953
    +1954        if self.get_header(database=database):
    +1955            return self.get_header(database=database).infos
    +1956        else:
    +1957            return None
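`get_annotations` simply returns the `infos` mapping of the parsed header, or `None` when no header is available. The project's own header object is not shown in this diff, so the sketch below uses `pysam.VariantFile` as a stand-in for reading the same kind of `##INFO` metadata; the file path is an assumption.

```python
import pysam

# Path is an assumption for the example
vcf = pysam.VariantFile("tests/data/example.vcf.gz")

# Mapping of INFO field ID -> metadata (number, type, description)
annotations = vcf.header.info

for info_id, info in annotations.items():
    print(info_id, info.number, info.type, info.description)
```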
     
    @@ -9582,54 +9616,54 @@
    Returns
    -
    1949    def get_extra_columns(
    -1950        self, database: str = None, database_type: str = None, sql_query: str = None
    -1951    ) -> list:
    -1952        """
    -1953        This Python function returns a list of extra columns in a database table that are not needed
    -1954        based on the database type and existing columns.
    -1955
    -1956        :param database: A string representing the name of the database to retrieve columns from. If
    -1957        None is provided, the default database will be used
    -1958        :type database: str
    -1959        :param database_type: The `database_type` parameter in the `get_extra_columns` function
    -1960        represents the type of the database for which you want to retrieve the list of extra columns. It
    -1961        is used to determine which columns are needed based on the database type and the existing
    -1962        columns in the specified database table
    -1963        :type database_type: str
    -1964        :param sql_query: The `sql_query` parameter in the `get_extra_columns` function is used to pass
    -1965        an SQL query that can be used to retrieve specific columns from the database. This query can be
    -1966        customized to filter columns based on certain conditions or criteria before analyzing them to
    -1967        determine the extra columns that are not needed
    -1968        :type sql_query: str
    -1969        :return: A list of extra columns in a database table that are not needed based on the database
    -1970        type and existing columns.
    -1971        """
    -1972
    -1973        if not database:
    -1974            database = self.get_database()
    -1975
    -1976        if not database:
    -1977            return []
    -1978
    -1979        existing_columns = self.get_columns(
    -1980            database=database,
    -1981            table=self.get_database_table(database),
    -1982            sql_query=sql_query,
    -1983        )
    -1984        if not database_type:
    -1985            database_type = self.get_type(database=database, sql_query=sql_query)
    -1986        needed_columns = self.get_needed_columns(
    -1987            database_columns=existing_columns, database_type=database_type
    -1988        )
    -1989
    -1990        extra_columns = existing_columns.copy()
    -1991
    -1992        for needed_col in needed_columns:
    -1993            if needed_columns.get(needed_col) in extra_columns:
    -1994                extra_columns.remove(needed_columns.get(needed_col))
    -1995
    -1996        return extra_columns
    +            
    1959    def get_extra_columns(
    +1960        self, database: str = None, database_type: str = None, sql_query: str = None
    +1961    ) -> list:
    +1962        """
    +1963        This Python function returns a list of extra columns in a database table that are not needed
    +1964        based on the database type and existing columns.
    +1965
    +1966        :param database: A string representing the name of the database to retrieve columns from. If
    +1967        None is provided, the default database will be used
    +1968        :type database: str
    +1969        :param database_type: The `database_type` parameter in the `get_extra_columns` function
    +1970        represents the type of the database for which you want to retrieve the list of extra columns. It
    +1971        is used to determine which columns are needed based on the database type and the existing
    +1972        columns in the specified database table
    +1973        :type database_type: str
    +1974        :param sql_query: The `sql_query` parameter in the `get_extra_columns` function is used to pass
    +1975        an SQL query that can be used to retrieve specific columns from the database. This query can be
    +1976        customized to filter columns based on certain conditions or criteria before analyzing them to
    +1977        determine the extra columns that are not needed
    +1978        :type sql_query: str
    +1979        :return: A list of extra columns in a database table that are not needed based on the database
    +1980        type and existing columns.
    +1981        """
    +1982
    +1983        if not database:
    +1984            database = self.get_database()
    +1985
    +1986        if not database:
    +1987            return []
    +1988
    +1989        existing_columns = self.get_columns(
    +1990            database=database,
    +1991            table=self.get_database_table(database),
    +1992            sql_query=sql_query,
    +1993        )
    +1994        if not database_type:
    +1995            database_type = self.get_type(database=database, sql_query=sql_query)
    +1996        needed_columns = self.get_needed_columns(
    +1997            database_columns=existing_columns, database_type=database_type
    +1998        )
    +1999
    +2000        extra_columns = existing_columns.copy()
    +2001
    +2002        for needed_col in needed_columns:
    +2003            if needed_columns.get(needed_col) in extra_columns:
    +2004                extra_columns.remove(needed_columns.get(needed_col))
    +2005
    +2006        return extra_columns
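The `get_extra_columns` logic reduces to removing every required column from the full column list; a self-contained example with assumed column names:

```python
# Columns actually present in the table (assumed example)
existing_columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "CADD_score"]

# Columns required for the detected database type (assumed example)
needed_columns = {
    "#CHROM": "#CHROM", "POS": "POS", "ID": "ID", "REF": "REF",
    "ALT": "ALT", "QUAL": "QUAL", "FILTER": "FILTER", "INFO": "INFO",
}

extra_columns = existing_columns.copy()
for needed_col in needed_columns:
    if needed_columns.get(needed_col) in extra_columns:
        extra_columns.remove(needed_columns.get(needed_col))

print(extra_columns)  # ['CADD_score']
```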
     
    @@ -9672,40 +9706,40 @@
    Returns
    -
    1998    def is_vcf(self, database: str = None, sql_query: str = None) -> bool:
    -1999        """
    -2000        The `is_vcf` function checks if a given database is of type "vcf" by examining its columns and
    -2001        their types.
    -2002
    -2003        :param database: The `database` parameter in the `is_vcf` function is a string that represents
    -2004        the name of the database that the function will use to check if the file is a VCF (Variant Call
    -2005        Format) file. If the `database` parameter is not provided when calling the function, it will
    -2006        :type database: str
    -2007        :param sql_query: The `sql_query` parameter in the `is_vcf` function is used to pass an SQL
    -2008        query string that can be used to filter the columns retrieved from the database. This query can
    -2009        be used to narrow down the columns that are considered when checking if the database is of type
    -2010        "vcf"
    -2011        :type sql_query: str
    -2012        :return: The function `is_vcf` returns a boolean value indicating whether the database type is
    -2013        "vcf" or not.
    -2014        """
    -2015
    -2016        if not database:
    -2017            database = self.get_database()
    -2018
    -2019        if not database:
    -2020            return False
    -2021
    -2022        database_columns = self.get_columns(
    -2023            database=database,
    -2024            table=self.get_database_table(database),
    -2025            sql_query=sql_query,
    -2026        )
    -2027
    -2028        # Assume a VCF has the 8 required columns, either alone or with an extra FORMAT column (remaining columns are assumed to be samples)
    -2029        return self.get_type_from_columns(
    -2030            database_columns=database_columns, check_database_type="vcf"
    -2031        ) == "vcf" and ("FORMAT" in database_columns or len(database_columns) == 8)
    +            
    2008    def is_vcf(self, database: str = None, sql_query: str = None) -> bool:
    +2009        """
    +2010        The `is_vcf` function checks if a given database is of type "vcf" by examining its columns and
    +2011        their types.
    +2012
    +2013        :param database: The `database` parameter in the `is_vcf` function is a string that represents
    +2014        the name of the database that the function will use to check if the file is a VCF (Variant Call
    +2015        Format) file. If the `database` parameter is not provided when calling the function, it will be retrieved via the `get_database()` method
    +2016        :type database: str
    +2017        :param sql_query: The `sql_query` parameter in the `is_vcf` function is used to pass an SQL
    +2018        query string that can be used to filter the columns retrieved from the database. This query can
    +2019        be used to narrow down the columns that are considered when checking if the database is of type
    +2020        "vcf"
    +2021        :type sql_query: str
    +2022        :return: The function `is_vcf` returns a boolean value indicating whether the database type is
    +2023        "vcf" or not.
    +2024        """
    +2025
    +2026        if not database:
    +2027            database = self.get_database()
    +2028
    +2029        if not database:
    +2030            return False
    +2031
    +2032        database_columns = self.get_columns(
    +2033            database=database,
    +2034            table=self.get_database_table(database),
    +2035            sql_query=sql_query,
    +2036        )
    +2037
    +2038        # Assume a VCF has the 8 required columns, either alone or with an extra FORMAT column (any other columns are assumed to be samples)
    +2039        return self.get_type_from_columns(
    +2040            database_columns=database_columns, check_database_type="vcf"
    +2041        ) == "vcf" and ("FORMAT" in database_columns or len(database_columns) == 8)
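A short sketch of the `is_vcf` check (same hypothetical `db` object as in the sketch above; the heuristic is the one stated in the comment: the 8 mandatory VCF columns, optionally followed by FORMAT and sample columns):

    # Hypothetical sketch: decide between VCF-style and generic handling
    if db.is_vcf():
        print("columns match the VCF layout")
    else:
        print("not a VCF-like table")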
     
    @@ -9745,13 +9779,13 @@
    Returns
    -
    2033    def get_conn(self):
    -2034        """
    -2035        The function returns the connection object.
    -2036        :return: The method is returning the value of the instance variable `self.conn`.
    -2037        """
    -2038
    -2039        return self.conn
    +            
    2043    def get_conn(self):
    +2044        """
    +2045        The function returns the connection object.
    +2046        :return: The method is returning the value of the instance variable `self.conn`.
    +2047        """
    +2048
    +2049        return self.conn
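`get_conn` simply exposes the underlying DuckDB connection, so ad-hoc SQL can be run directly against it (a minimal sketch; the table name "variants" is only the default table used elsewhere in this class and may differ for a given database):

    # Hypothetical sketch: reuse the DuckDB connection for a custom query
    conn = db.get_conn()
    row_count = conn.execute("SELECT count(*) FROM variants").fetchone()[0]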
     
    @@ -9777,79 +9811,79 @@
    Returns
    -
    2041    def is_genotype_column(
    -2042        self,
    -2043        column: str,
    -2044        database: str = None,
    -2045        downsampling: int = 1000,
    -2046        check_format: bool = True,
    -2047    ) -> bool:
    -2048        """
    -2049        The `is_genotype_column` function in Python checks if a specified column in a database contains
    -2050        genotype data based on a regular expression pattern.
    -2051
    -2052        :param column: The `column` parameter is a string that represents the name of a column in a
    -2053        database table. It is used to specify the column for which you want to check if it contains
    -2054        genotype information based on a regular expression pattern
    -2055        :type column: str
    -2056        :param database: The `database` parameter in the `is_genotype_column` method is used to specify
    -2057        the name of the database from which the data will be queried. If a database is provided, the
    -2058        method will query the specified database to check if the given column contains genotype
    -2059        information. If no database is provided, the default database is used
    -2060        :type database: str
    -2061        :param downsampling: The `downsampling` parameter in the `is_genotype_column` method is an
    -2062        integer value that determines the number of rows to be sampled from the database table when
    -2063        checking for genotype information in the specified column. This parameter is used to limit the
    -2064        number of rows to be processed in order to improve performance, defaults to 1000
    -2065        :type downsampling: int (optional)
    -2066        :param check_format: The `check_format` parameter in the `is_genotype_column` method is a
    -2067        boolean flag that determines whether the function should check the format of the data before
    -2068        proceeding with the genotype column analysis. If `check_format` is set to `True`, the function
    -2069        will verify that the FORMAT column and the specified column exist in the table, defaults to True
    -2070        :type check_format: bool (optional)
    -2071        :return: The `is_genotype_column` method returns a boolean value. If the specified column in a
    -2072        database table contains genotype information, it returns `True`; otherwise, it returns `False`.
    -2073        """
    -2074
    -2075        # Table variants
    -2076        table_variants_from = self.get_sql_database_link(database=database)
    -2077
    -2078        # Check if format column is present
    -2079        if check_format:
    -2080            query = f"""
    -2081                SELECT * FROM {table_variants_from}
    -2082                LIMIT 0
    -2083            """
    -2084            df = self.query(query=query)
    -2085            if "FORMAT" not in df.columns or column not in df.columns:
    -2086                return False
    -2087            query_format = f"""
    -2088                AND (len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{column}" AS VARCHAR), ':')) OR regexp_matches(CAST("{column}" AS VARCHAR), '^[.]([/|][.])*$'))
    -2089            """
    -2090        else:
    -2091            query_format = ""
    -2092
    -2093        # Downsampling query: sample a limited number of rows
    -2094        query_downsampling = f"""
    -2095            SELECT "{column}", FORMAT
    -2096            FROM {table_variants_from}
    -2097            LIMIT {downsampling}
    -2098        """
    -2099        df_downsampling = self.query(query=query_downsampling)
    -2100
    -2101        # Query to check genotype
    -2102        query_genotype = f"""
    -2103            SELECT  *
    -2104            FROM df_downsampling
    -2105            WHERE (
    -2106                regexp_matches(CAST("{column}" AS VARCHAR), '^[0-9.]([/|][0-9.])*')
    -2107                {query_format}
    -2108                )
    -2109        """
    -2110        df_genotype = self.query(query=query_genotype)
    -2111
    -2112        # Return True if every sampled row matches the genotype pattern
    -2113        return len(df_genotype) == len(df_downsampling)
    +            
    2051    def is_genotype_column(
    +2052        self,
    +2053        column: str,
    +2054        database: str = None,
    +2055        downsampling: int = 1000,
    +2056        check_format: bool = True,
    +2057    ) -> bool:
    +2058        """
    +2059        The `is_genotype_column` function in Python checks if a specified column in a database contains
    +2060        genotype data based on a regular expression pattern.
    +2061
    +2062        :param column: The `column` parameter is a string that represents the name of a column in a
    +2063        database table. It is used to specify the column for which you want to check if it contains
    +2064        genotype information based on a regular expression pattern
    +2065        :type column: str
    +2066        :param database: The `database` parameter in the `is_genotype_column` method is used to specify
    +2067        the name of the database from which the data will be queried. If a database is provided, the
    +2068        method will query the specified database to check if the given column contains genotype
    +2069        information. If no database is provided, the default database is used
    +2070        :type database: str
    +2071        :param downsampling: The `downsampling` parameter in the `is_genotype_column` method is an
    +2072        integer value that determines the number of rows to be sampled from the database table when
    +2073        checking for genotype information in the specified column. This parameter is used to limit the
    +2074        number of rows to be processed in order to improve performance, defaults to 1000
    +2075        :type downsampling: int (optional)
    +2076        :param check_format: The `check_format` parameter in the `is_genotype_column` method is a
    +2077        boolean flag that determines whether the function should check the format of the data before
    +2078        proceeding with the genotype column analysis. If `check_format` is set to `True`, the function
    +2079        will verify that the FORMAT column and the specified column exist in the table, defaults to True
    +2080        :type check_format: bool (optional)
    +2081        :return: The `is_genotype_column` method returns a boolean value. If the specified column in a
    +2082        database table contains genotype information, it returns `True`; otherwise, it returns `False`.
    +2083        """
    +2084
    +2085        # Table variants
    +2086        table_variants_from = self.get_sql_database_link(database=database)
    +2087
    +2088        # Check if format column is present
    +2089        if check_format:
    +2090            query = f"""
    +2091                SELECT * FROM {table_variants_from}
    +2092                LIMIT 0
    +2093            """
    +2094            df = self.query(query=query)
    +2095            if "FORMAT" not in df.columns or column not in df.columns:
    +2096                return False
    +2097            query_format = f"""
    +2098                AND (len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{column}" AS VARCHAR), ':')) OR regexp_matches(CAST("{column}" AS VARCHAR), '^[.]([/|][.])*$'))
    +2099            """
    +2100        else:
    +2101            query_format = ""
    +2102
    +2103        # Downsampling query: sample a limited number of rows
    +2104        query_downsampling = f"""
    +2105            SELECT "{column}", FORMAT
    +2106            FROM {table_variants_from}
    +2107            LIMIT {downsampling}
    +2108        """
    +2109        df_downsampling = self.query(query=query_downsampling)
    +2110
    +2111        # Query to check genotype
    +2112        query_genotype = f"""
    +2113            SELECT  *
    +2114            FROM df_downsampling
    +2115            WHERE (
    +2116                regexp_matches(CAST("{column}" AS VARCHAR), '^[0-9.]([/|][0-9.])*')
    +2117                {query_format}
    +2118                )
    +2119        """
    +2120        df_genotype = self.query(query=query_genotype)
    +2121
    +2122        # Return True if every sampled row matches the genotype pattern
    +2123        return len(df_genotype) == len(df_downsampling)
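The genotype detection above relies on the pattern '^[0-9.]([/|][0-9.])*', which accepts values such as 0/1, 1|1 or "./.". A standalone sketch of that pattern with Python's re module (independent of the DuckDB regexp_matches call used in the method):

    import re

    # Same pattern as in the query above: an allele character, optionally repeated with / or | separators
    genotype_pattern = re.compile(r"^[0-9.]([/|][0-9.])*")

    for value in ["0/1", "1|1", "./.", "12.3", "foo"]:
        print(value, bool(genotype_pattern.match(value)))
    # "0/1", "1|1" and "./." match; "foo" does not; "12.3" also matches at its first character,
    # which is why the method additionally compares FORMAT field lengths when check_format is True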
     
    @@ -9897,807 +9931,809 @@
    Returns
    -
    2115    def export(
    -2116        self,
    -2117        output_database: str,
    -2118        output_header: str = None,
    -2119        header_in_output: bool = True,
    -2120        database: str = None,
    -2121        table: str = "variants",
    -2122        parquet_partitions: list = None,
    -2123        threads: int = 1,
    -2124        sort: bool = False,
    -2125        index: bool = False,
    -2126        existing_columns_header: list = [],
    -2127        order_by: str = "",
    -2128        query: str = None,
    -2129        compression_type: str = None,
    -2130        chunk_size: int = 1000000,
    -2131        export_mode: str = "pyarrow",
    -2132        compresslevel: int = 6,
    -2133        export_header: bool = True,
    -2134        sample_list: list = None,
    -2135    ) -> bool:
    -2136        """
    -2137        The `export` function exports data from a database to a specified output format, compresses it
    -2138        if necessary, and returns a boolean value indicating whether the export was successful or not.
    -2139
    -2140        :param output_database: The `output_database` parameter is a string that represents the path and
    -2141        filename of the output file to be exported. It specifies where the exported data will be saved
    -2142        :type output_database: str
    -2143        :param output_header: The `output_header` parameter is an optional string that represents the
    -2144        header of the output file. If provided, it specifies the header that will be included in the
    -2145        output file. If not provided, the header will be automatically detected based on the output file
    -2146        format
    -2147        :type output_header: str
    -2148        :param header_in_output: The `header_in_output` parameter is a boolean value that determines
    -2149        whether the header should be included in the output file. If set to `True`, the header will be
    -2150        included in the output file. If set to `False`, the header will not be included in the output
    -2151        file, defaults to True
    -2152        :type header_in_output: bool (optional)
    -2153        :param database: The `database` parameter is the name of the database from which you want to
    -2154        export data. If this parameter is not provided, the function will use the `get_database()`
    -2155        method to retrieve the current database
    -2156        :type database: str
    -2157        :param table: The `table` parameter specifies the name of the table in the database from which
    -2158        the data will be exported. If not specified, it is set to "variants", defaults to
    -2159        variants
    -2160        :type table: str (optional)
    -2161        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
    -2162        partition columns for the Parquet output format. Each element in the list represents a partition
    -2163        column. The partitions are used to organize the data in the Parquet file based on the values of
    -2164        the specified columns
    -2165        :type parquet_partitions: list
    -2166        :param threads: The `threads` parameter in the `export` function is an optional integer that
    -2167        specifies the number of threads to use for exporting the data. It determines the level of
    -2168        parallelism during the export process. By default, it is set to 1, defaults to 1
    -2169        :type threads: int (optional)
    -2170        :param sort: The `sort` parameter in the `export` function is a boolean value that specifies
    -2171        whether the output file should be sorted based on the genomic coordinates of the variants. If
    -2172        `sort` is set to `True`, the output file will be sorted. If `sort` is set to `False`, it will not be sorted, defaults
    -2173        to False
    -2174        :type sort: bool (optional)
    -2175        :param index: The `index` parameter is a boolean value that specifies whether to index the
    -2176        output file. If `index` is set to `True`, the output file will be indexed. If `index` is set to
    -2177        `False` or not provided, the output file will not be indexed, defaults to False
    -2178        :type index: bool (optional)
    -2179        :param existing_columns_header: The `existing_columns_header` parameter is a list that
    -2180        represents the existing columns in the header of the output file. It is used to determine the
    -2181        columns that should be included in the output file. If this parameter is not provided, the
    -2182        function will automatically detect the header columns based on the output file format
    -2183        :type existing_columns_header: list
    -2184        :param order_by: The `order_by` parameter in the `export` function is a string that specifies
    -2185        the columns by which the output file should be ordered. You can specify multiple columns
    -2186        separated by commas. Each column can be followed by the keyword "ASC" (ascending) or "DESC"
    -2187        (descending) to specify
    -2188        :type order_by: str
    -2189        :param query: The `query` parameter in the `export` function represents a SQL query that
    -2190        specifies the data to be exported from the database. If provided, the function will export the
    -2191        result of this query. If the `query` parameter is not provided, the function will generate a
    -2192        query to export the data from
    -2193        :type query: str
    -2194        :param compression_type: The `compression_type` parameter in the `export` function specifies the
    -2195        type of compression to be applied to the output file. By default, the compression type is set to
    -2196        "bgzip". This parameter allows you to choose the compression algorithm for the output file, such
    -2197        as "gzip" or "bgzip"
    -2198        :type compression_type: str
    -2199        :param chunk_size: The `chunk_size` parameter in the `export` function specifies the size of
    -2200        each chunk or batch of data that will be processed during the export operation. It determines
    -2201        how many records or lines of data will be included in each chunk that is processed at a time,
    -2202        defaults to 1000000
    -2203        :type chunk_size: int (optional)
    -2204        :param export_mode: The `export_mode` parameter in the `export` function specifies the mode of
    -2205        export, which can be either "pyarrow" or "duckdb", defaults to pyarrow
    -2206        :type export_mode: str (optional)
    -2207        :param compresslevel: The `compresslevel` parameter in the `export` function represents the
    -2208        level of compression for gzip. By default, it is set to 6. This parameter allows you to specify
    -2209        the compression level when using gzip compression for the output file. The compression level can
    -2210        range from 0 (no compression) to 9 (maximum compression), defaults to 6
    -2211        :type compresslevel: int (optional)
    -2212        :param export_header: The `export_header` parameter is a boolean flag that determines whether
    -2213        the header of a VCF file should be exported to a separate file or not. If `export_header` is
    -2214        True, the header will be exported to a file. If `export_header` is False, the header will not
    -2215        be exported, defaults to True
    -2216        :type export_header: bool (optional)
    -2217        :param sample_list: The `sample_list` parameter in the `export` function is a list that
    -2218        specifies the samples to be included in the exported data. If provided, the samples listed in
    -2219        this parameter will be included in the output file. If not provided, the function will determine
    -2220        the samples to include based on the data
    -2221        :type sample_list: list
    -2222        :return: The `export` function returns a boolean value indicating whether the export was
    -2223        successful or not.
    -2224        """
    -2225
    -2226        # Full path
    -2227        output_database = full_path(output_database)
    -2228        output_header = full_path(output_header)
    -2229        database = full_path(database)
    -2230
    -2231        # Database
    -2232        if not database:
    -2233            database = self.get_database()
    -2234        if not database:
    -2235            return False
    -2236
    -2237        # Chunk size
    -2238        if not chunk_size:
    -2239            chunk_size = 1000000
    -2240        else:
    -2241            chunk_size = int(chunk_size)
    -2242
    -2243        # Export mode
    -2244        # Either "pyarrow" (default) or "duckdb"
    -2245        if not export_mode:
    -2246            export_mode = "pyarrow"
    -2247
    -2248        # Compression level
    -2249        if not compresslevel:
    -2250            compresslevel = 6
    -2251
    -2252        # Remove output if exists
    -2253        remove_if_exists(output_database)
    -2254
    -2255        # Tmp
    -2256        tmp_folder = os.path.dirname(output_database)
    -2257        if not tmp_folder:
    -2258            tmp_folder = "."
    -2259
    -2260        with TemporaryDirectory(
    -2261            dir=tmp_folder, prefix="howard_database_export_"
    -2262        ) as tmp_dir:
    -2263
    -2264            # tmp files
    -2265            tmp_files = []
    -2266
    -2267            # query_set
    -2268            query_set = ""
    +            
    2125    def export(
    +2126        self,
    +2127        output_database: str,
    +2128        output_header: str = None,
    +2129        header_in_output: bool = True,
    +2130        database: str = None,
    +2131        table: str = "variants",
    +2132        parquet_partitions: list = None,
    +2133        threads: int = 1,
    +2134        sort: bool = False,
    +2135        index: bool = False,
    +2136        existing_columns_header: list = [],
    +2137        order_by: str = "",
    +2138        query: str = None,
    +2139        compression_type: str = None,
    +2140        chunk_size: int = 1000000,
    +2141        export_mode: str = "pyarrow",
    +2142        compresslevel: int = 6,
    +2143        export_header: bool = True,
    +2144        sample_list: list = None,
    +2145    ) -> bool:
    +2146        """
    +2147        The `export` function exports data from a database to a specified output format, compresses it
    +2148        if necessary, and returns a boolean value indicating whether the export was successful or not.
    +2149
    +2150        :param output_database: The `output_database` parameter is a string that represents the path and
    +2151        filename of the output file to be exported. It specifies where the exported data will be saved
    +2152        :type output_database: str
    +2153        :param output_header: The `output_header` parameter is an optional string that represents the
    +2154        header of the output file. If provided, it specifies the header that will be included in the
    +2155        output file. If not provided, the header will be automatically detected based on the output file
    +2156        format
    +2157        :type output_header: str
    +2158        :param header_in_output: The `header_in_output` parameter is a boolean value that determines
    +2159        whether the header should be included in the output file. If set to `True`, the header will be
    +2160        included in the output file. If set to `False`, the header will not be included in the output
    +2161        file, defaults to True
    +2162        :type header_in_output: bool (optional)
    +2163        :param database: The `database` parameter is the name of the database from which you want to
    +2164        export data. If this parameter is not provided, the function will use the `get_database()`
    +2165        method to retrieve the current database
    +2166        :type database: str
    +2167        :param table: The `table` parameter specifies the name of the table in the database from which
    +2168        the data will be exported. If not specified, it is set to "variants", defaults to
    +2169        variants
    +2170        :type table: str (optional)
    +2171        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
    +2172        partition columns for the Parquet output format. Each element in the list represents a partition
    +2173        column. The partitions are used to organize the data in the Parquet file based on the values of
    +2174        the specified columns
    +2175        :type parquet_partitions: list
    +2176        :param threads: The `threads` parameter in the `export` function is an optional integer that
    +2177        specifies the number of threads to use for exporting the data. It determines the level of
    +2178        parallelism during the export process. By default, it is set to 1, defaults to 1
    +2179        :type threads: int (optional)
    +2180        :param sort: The `sort` parameter in the `export` function is a boolean value that specifies
    +2181        whether the output file should be sorted based on the genomic coordinates of the variants. If
    +2182        `sort` is set to `True`, the output file will be sorted. If `sort` is set to `False`, it will not be sorted, defaults
    +2183        to False
    +2184        :type sort: bool (optional)
    +2185        :param index: The `index` parameter is a boolean value that specifies whether to index the
    +2186        output file. If `index` is set to `True`, the output file will be indexed. If `index` is set to
    +2187        `False` or not provided, the output file will not be indexed, defaults to False
    +2188        :type index: bool (optional)
    +2189        :param existing_columns_header: The `existing_columns_header` parameter is a list that
    +2190        represents the existing columns in the header of the output file. It is used to determine the
    +2191        columns that should be included in the output file. If this parameter is not provided, the
    +2192        function will automatically detect the header columns based on the output file format
    +2193        :type existing_columns_header: list
    +2194        :param order_by: The `order_by` parameter in the `export` function is a string that specifies
    +2195        the columns by which the output file should be ordered. You can specify multiple columns
    +2196        separated by commas. Each column can be followed by the keyword "ASC" (ascending) or "DESC"
    +2197        (descending) to specify
    +2198        :type order_by: str
    +2199        :param query: The `query` parameter in the `export` function represents a SQL query that
    +2200        specifies the data to be exported from the database. If provided, the function will export the
    +2201        result of this query. If the `query` parameter is not provided, the function will generate a
    +2202        query to export the data from
    +2203        :type query: str
    +2204        :param compression_type: The `compression_type` parameter in the `export` function specifies the
    +2205        type of compression to be applied to the output file. By default, the compression type is set to
    +2206        "bgzip". This parameter allows you to choose the compression algorithm for the output file, such
    +2207        as "gzip" or "bgzip"
    +2208        :type compression_type: str
    +2209        :param chunk_size: The `chunk_size` parameter in the `export` function specifies the size of
    +2210        each chunk or batch of data that will be processed during the export operation. It determines
    +2211        how many records or lines of data will be included in each chunk that is processed at a time,
    +2212        defaults to 1000000
    +2213        :type chunk_size: int (optional)
    +2214        :param export_mode: The `export_mode` parameter in the `export` function specifies the mode of
    +2215        export, which can be either "pyarrow" or "duckdb", defaults to pyarrow
    +2216        :type export_mode: str (optional)
    +2217        :param compresslevel: The `compresslevel` parameter in the `export` function represents the
    +2218        level of compression for gzip. By default, it is set to 6. This parameter allows you to specify
    +2219        the compression level when using gzip compression for the output file. The compression level can
    +2220        range from 0 (no compression) to 9 (maximum compression), defaults to 6
    +2221        :type compresslevel: int (optional)
    +2222        :param export_header: The `export_header` parameter is a boolean flag that determines whether
    +2223        the header of a VCF file should be exported to a separate file or not. If `export_header` is
    +2224        True, the header will be exported to a file. If `export_header` is False, the header will not
    +2225        be exported, defaults to True
    +2226        :type export_header: bool (optional)
    +2227        :param sample_list: The `sample_list` parameter in the `export` function is a list that
    +2228        specifies the samples to be included in the exported data. If provided, the samples listed in
    +2229        this parameter will be included in the output file. If not provided, the function will determine
    +2230        the samples to include based on the data
    +2231        :type sample_list: list
    +2232        :return: The `export` function returns a boolean value indicating whether the export was
    +2233        successful or not.
    +2234        """
    +2235
    +2236        # Full path
    +2237        output_database = full_path(output_database)
    +2238        output_header = full_path(output_header)
    +2239        database = full_path(database)
    +2240
    +2241        # Database
    +2242        if not database:
    +2243            database = self.get_database()
    +2244        if not database:
    +2245            return False
    +2246
    +2247        # Chunk size
    +2248        if not chunk_size:
    +2249            chunk_size = 1000000
    +2250        else:
    +2251            chunk_size = int(chunk_size)
    +2252
    +2253        # Export mode
    +2254        # Either "pyarrow" (default) or "duckdb"
    +2255        if not export_mode:
    +2256            export_mode = "pyarrow"
    +2257
    +2258        # Compression level
    +2259        if not compresslevel:
    +2260            compresslevel = 6
    +2261
    +2262        # Remove output if exists
    +2263        remove_if_exists(output_database)
    +2264
    +2265        # Tmp
    +2266        tmp_folder = os.path.dirname(output_database)
    +2267        if not tmp_folder:
    +2268            tmp_folder = "."
     2269
    -2270            # Header columns
    -2271            if not existing_columns_header and output_header:
    -2272                existing_columns_header = self.get_header_file_columns(output_header)
    +2270        with TemporaryDirectory(
    +2271            dir=tmp_folder, prefix="howard_database_export_"
    +2272        ) as tmp_dir:
     2273
    -2274            # Auto-detect output type, compression and delimiter
    -2275            output_type = get_file_format(output_database)
    -2276            compressed = self.is_compressed(database=output_database)
    -2277            delimiter = FILE_FORMAT_DELIMITERS.get(output_type, "\t")
    -2278
    -2279            # database type
    -2280            if output_type in ["vcf"]:
    -2281                database_type = "vcf"
    -2282            elif output_type in ["bed"]:
    -2283                database_type = "regions"
    -2284            else:
    -2285                database_type = self.get_type(database=database, sql_query=query)
    -2286
    -2287            # database object
    -2288            # If database is string, then create database conn
    -2289            if isinstance(database, str):
    -2290                database_conn = Database(database=database).get_conn()
    -2291            else:
    -2292                database_conn = database
    -2293
    -2294            # Existing columns
    -2295            existing_columns = self.get_columns(
    -2296                database=database_conn,
    -2297                table=self.get_database_table(database=database),
    -2298                sql_query=query,
    -2299            )
    -2300
    -2301            # Extra columns
    -2302            extra_columns = self.get_extra_columns(
    -2303                database=database_conn, database_type=output_type, sql_query=query
    -2304            )
    -2305
    -2306            # Needed columns
    -2307            needed_columns = self.get_needed_columns(
    -2308                database_columns=existing_columns, database_type=database_type
    +2274            # tmp files
    +2275            tmp_files = []
    +2276
    +2277            # query_set
    +2278            query_set = ""
    +2279
    +2280            # Header columns
    +2281            if not existing_columns_header and output_header:
    +2282                existing_columns_header = self.get_header_file_columns(output_header)
    +2283
    +2284            # Auto-detect output type, compression and delimiter
    +2285            output_type = get_file_format(output_database)
    +2286            compressed = self.is_compressed(database=output_database)
    +2287            delimiter = FILE_FORMAT_DELIMITERS.get(output_type, "\t")
    +2288
    +2289            # database type
    +2290            if output_type in ["vcf"]:
    +2291                database_type = "vcf"
    +2292            elif output_type in ["bed"]:
    +2293                database_type = "regions"
    +2294            else:
    +2295                database_type = self.get_type(database=database, sql_query=query)
    +2296
    +2297            # database object
    +2298            # If database is string, then create database conn
    +2299            if isinstance(database, str):
    +2300                database_conn = Database(database=database).get_conn()
    +2301            else:
    +2302                database_conn = database
    +2303
    +2304            # Existing columns
    +2305            existing_columns = self.get_columns(
    +2306                database=database_conn,
    +2307                table=self.get_database_table(database=database),
    +2308                sql_query=query,
     2309            )
     2310
    -2311            # Order by
    -2312            order_by_list = []
    -2313            if order_by:
    -2314                # Split order by options
    -2315                order_by_split = order_by.split(",")
    -2316                for order_by_option in order_by_split:
    -2317                    # Split order by option
    -2318                    order_by_option_split = order_by_option.strip().split(" ")
    -2319                    order_by_option_split_column = order_by_option_split[0]
    -2320                    if len(order_by_option_split) > 1:
    -2321                        order_by_option_split_order = order_by_option_split[1]
    -2322                    else:
    -2323                        order_by_option_split_order = "ASC"
    -2324                    # Check if the column exists
    -2325                    if (
    -2326                        order_by_option_split_column.replace('"', "").strip()
    -2327                        in existing_columns
    -2328                    ):
    -2329                        order_by_list.append(
    -2330                            f"{order_by_option_split_column} {order_by_option_split_order}"
    -2331                        )
    -2332
    -2333            # Clean order by
    -2334            order_by_clean = ", ".join(order_by_list)
    -2335
    -2336            # Query values
    -2337            default_empty_value = ""
    -2338            query_export_format = None
    -2339            include_header = False
    -2340            post_process = False
    -2341            order_by_sql = ""
    -2342            post_process_just_move = False
    -2343
    -2344            # export options
    -2345            export_options = {}
    -2346
    -2347            # VCF
    -2348            if output_type in ["vcf"]:
    -2349                if not self.is_vcf(database=database_conn, sql_query=query):
    -2350                    extra_columns = []
    -2351                else:
    -2352                    extra_columns = existing_columns_header
    +2311            # Extra columns
    +2312            extra_columns = self.get_extra_columns(
    +2313                database=database_conn, database_type=output_type, sql_query=query
    +2314            )
    +2315
    +2316            # Needed columns
    +2317            needed_columns = self.get_needed_columns(
    +2318                database_columns=existing_columns, database_type=database_type
    +2319            )
    +2320
    +2321            # Order by
    +2322            order_by_list = []
    +2323            if order_by:
    +2324                # Split order by options
    +2325                order_by_split = order_by.split(",")
    +2326                for order_by_option in order_by_split:
    +2327                    # Split order by option
    +2328                    order_by_option_split = order_by_option.strip().split(" ")
    +2329                    order_by_option_split_column = order_by_option_split[0]
    +2330                    if len(order_by_option_split) > 1:
    +2331                        order_by_option_split_order = order_by_option_split[1]
    +2332                    else:
    +2333                        order_by_option_split_order = "ASC"
    +2334                    # Check if the column exists
    +2335                    if (
    +2336                        order_by_option_split_column.replace('"', "").strip()
    +2337                        in existing_columns
    +2338                    ):
    +2339                        order_by_list.append(
    +2340                            f"{order_by_option_split_column} {order_by_option_split_order}"
    +2341                        )
    +2342
    +2343            # Clean order by
    +2344            order_by_clean = ", ".join(order_by_list)
    +2345
    +2346            # Query values
    +2347            default_empty_value = ""
    +2348            query_export_format = None
    +2349            include_header = False
    +2350            post_process = False
    +2351            order_by_sql = ""
    +2352            post_process_just_move = False
     2353
    -2354                # Check VCF format with extra columns
    -2355                extra_columns_clean = []
    +2354            # export options
    +2355            export_options = {}
     2356
    -2357                # Force samples list in parameter
    -2358                if sample_list:
    -2359                    if "FORMAT" in extra_columns:
    -2360                        extra_columns = ["FORMAT"] + sample_list
    -2361                    else:
    -2362                        extra_columns = sample_list
    +2357            # VCF
    +2358            if output_type in ["vcf"]:
    +2359                if not self.is_vcf(database=database_conn, sql_query=query):
    +2360                    extra_columns = []
    +2361                else:
    +2362                    extra_columns = existing_columns_header
     2363
    -2364                # Check columns
    -2365                else:
    -2366                    for extra_column in extra_columns:
    -2367                        if extra_column not in needed_columns and (
    -2368                            extra_column == "FORMAT"
    -2369                            or (
    -2370                                "FORMAT" in extra_columns_clean
    -2371                                and self.is_genotype_column(
    -2372                                    database=database, column=extra_column
    -2373                                )
    -2374                            )
    -2375                        ):
    -2376                            extra_columns_clean.append(extra_column)
    -2377                    extra_columns = extra_columns_clean
    -2378
    -2379                default_empty_value = "."
    -2380                query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER, QUOTE '', COMPRESSION 'gzip'"
    -2381                include_header = True
    -2382                post_process = True
    -2383                # Export options
    -2384                if not compression_type:
    -2385                    compression_type = "bgzip"
    -2386                export_options = {
    -2387                    "format": "CSV",
    -2388                    "delimiter": delimiter,
    -2389                    "header": True,
    -2390                    "quote": None,
    -2391                    "compression": compression_type,
    -2392                }
    -2393                compresslevel = 1
    -2394
    -2395            # TSV/CSV/TBL
    -2396            elif output_type in ["tsv", "csv", "tbl"]:
    -2397                if output_type in ["csv", "tbl"]:
    -2398                    quote = '"'
    -2399                else:
    -2400                    quote = ""
    -2401                query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER, QUOTE '{quote}', COMPRESSION 'gzip'"
    -2402                if delimiter in ["\t"]:
    -2403                    include_header = header_in_output and True
    -2404                post_process = True
    -2405                if order_by_clean:
    -2406                    order_by_sql = f"ORDER BY {order_by_clean}"
    -2407                # Export options
    -2408                if not compression_type:
    -2409                    compression_type = "gzip"
    -2410                export_options = {
    -2411                    "format": "CSV",
    -2412                    "delimiter": delimiter,
    -2413                    "header": True,
    -2414                    "quote": quote,
    -2415                    "compression": compression_type,
    -2416                }
    -2417
    -2418            # JSON
    -2419            elif output_type in ["json"]:
    -2420                query_export_format = "FORMAT JSON, ARRAY TRUE"
    -2421                include_header = False
    -2422                post_process = True
    -2423                if order_by_clean:
    -2424                    order_by_sql = f"ORDER BY {order_by_clean}"
    -2425                # Export options
    -2426                if not compression_type:
    -2427                    compression_type = "gzip"
    -2428                export_options = {
    -2429                    "format": "JSON",
    -2430                    "array": True,
    -2431                    "compression": compression_type,
    -2432                }
    -2433
    -2434            # Parquet
    -2435            elif output_type in ["parquet"]:
    -2436                query_export_format = "FORMAT PARQUET"
    -2437                # Export options
    +2364                # Check VCF format with extra columns
    +2365                extra_columns_clean = []
    +2366
    +2367                # Force samples list in parameter
    +2368                if sample_list:
    +2369                    if "FORMAT" in extra_columns:
    +2370                        extra_columns = ["FORMAT"] + sample_list
    +2371                    else:
    +2372                        extra_columns = sample_list
    +2373
    +2374                # Check columns
    +2375                else:
    +2376                    for extra_column in extra_columns:
    +2377                        if extra_column not in needed_columns and (
    +2378                            extra_column == "FORMAT"
    +2379                            or (
    +2380                                "FORMAT" in extra_columns_clean
    +2381                                and self.is_genotype_column(
    +2382                                    database=database, column=extra_column
    +2383                                )
    +2384                            )
    +2385                        ):
    +2386                            extra_columns_clean.append(extra_column)
    +2387                    extra_columns = extra_columns_clean
    +2388
    +2389                default_empty_value = "."
    +2390                query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER, QUOTE '', COMPRESSION 'gzip'"
    +2391                include_header = True
    +2392                post_process = True
    +2393                # Export options
    +2394                if not compression_type:
    +2395                    compression_type = "bgzip"
    +2396                export_options = {
    +2397                    "format": "CSV",
    +2398                    "delimiter": delimiter,
    +2399                    "header": True,
    +2400                    "quote": None,
    +2401                    "compression": compression_type,
    +2402                }
    +2403                compresslevel = 1
    +2404
    +2405            # TSV/CSV/TBL
    +2406            elif output_type in ["tsv", "csv", "tbl"]:
    +2407                if output_type in ["csv", "tbl"]:
    +2408                    quote = '"'
    +2409                else:
    +2410                    quote = ""
    +2411                query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER, QUOTE '{quote}', COMPRESSION 'gzip'"
    +2412                if delimiter in ["\t"]:
    +2413                    include_header = header_in_output and True
    +2414                post_process = True
    +2415                if order_by_clean:
    +2416                    order_by_sql = f"ORDER BY {order_by_clean}"
    +2417                # Export options
    +2418                if not compression_type:
    +2419                    compression_type = "gzip"
    +2420                export_options = {
    +2421                    "format": "CSV",
    +2422                    "delimiter": delimiter,
    +2423                    "header": True,
    +2424                    "quote": quote,
    +2425                    "compression": compression_type,
    +2426                }
    +2427
    +2428            # JSON
    +2429            elif output_type in ["json"]:
    +2430                query_export_format = "FORMAT JSON, ARRAY TRUE"
    +2431                include_header = False
    +2432                post_process = True
    +2433                if order_by_clean:
    +2434                    order_by_sql = f"ORDER BY {order_by_clean}"
    +2435                # Export options
    +2436                if not compression_type:
    +2437                    compression_type = "gzip"
     2438                export_options = {
    -2439                    "format": "PARQUET",
    -2440                }
    -2441                include_header = False
    -2442                post_process = True
    +2439                    "format": "JSON",
    +2440                    "array": True,
    +2441                    "compression": compression_type,
    +2442                }
     2443
    -2444            # BED
    -2445            elif output_type in ["bed"]:
    -2446                query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER"
    -2447                include_header = True
    -2448                post_process = True
    -2449                if order_by_clean:
    -2450                    order_by_sql = f"ORDER BY {order_by_clean}"
    -2451                # Export options
    -2452                if not compression_type:
    -2453                    compression_type = "gzip"
    -2454                export_options = {
    -2455                    "format": "CSV",
    -2456                    "delimiter": delimiter,
    -2457                    "header": True,
    -2458                    "quote": None,
    -2459                    "compression": compression_type,
    -2460                }
    -2461
    -2462            # duckDB
    -2463            elif output_type in ["duckdb"]:
    -2464
    -2465                # Needed columns (none for duckdb export)
    -2466                needed_columns = []
    -2467
    -2468                # Export database as Parquet
    -2469                database_export_parquet_file = os.path.join(tmp_dir, "output.parquet")
    -2470                self.export(
    -2471                    database=database, output_database=database_export_parquet_file
    -2472                )
    -2473
    -2474                # Create database and connection
    -2475                output_database_conn = duckdb.connect(output_database)
    -2476
    -2477                # Create table in database connection with Parquet file
    -2478                query_copy = f""" 
    -2479                    CREATE TABLE {table}
    -2480                    AS {self.get_sql_database_link(database=database_export_parquet_file)}
    -2481                    """
    -2482                output_database_conn.execute(query_copy)
    +2444            # Parquet
    +2445            elif output_type in ["parquet"]:
    +2446                query_export_format = "FORMAT PARQUET"
    +2447                # Export options
    +2448                export_options = {
    +2449                    "format": "PARQUET",
    +2450                }
    +2451                include_header = False
    +2452                post_process = True
    +2453
    +2454            # BED
    +2455            elif output_type in ["bed"]:
    +2456                query_export_format = f"FORMAT CSV, DELIMITER '{delimiter}', HEADER"
    +2457                include_header = True
    +2458                post_process = True
    +2459                if order_by_clean:
    +2460                    order_by_sql = f"ORDER BY {order_by_clean}"
    +2461                # Export options
    +2462                if not compression_type:
    +2463                    compression_type = "gzip"
    +2464                export_options = {
    +2465                    "format": "CSV",
    +2466                    "delimiter": delimiter,
    +2467                    "header": True,
    +2468                    "quote": None,
    +2469                    "compression": compression_type,
    +2470                }
    +2471
    +2472            # duckDB
    +2473            elif output_type in ["duckdb"]:
    +2474
    +2475                # Needed columns (none for duckdb export)
    +2476                needed_columns = []
    +2477
    +2478                # Export database as Parquet
    +2479                database_export_parquet_file = os.path.join(tmp_dir, "output.parquet")
    +2480                self.export(
    +2481                    database=database, output_database=database_export_parquet_file
    +2482                )
     2483
    -2484                # Close connection
    -2485                output_database_conn.close()
    +2484                # Create database and connection
    +2485                output_database_conn = duckdb.connect(output_database)
     2486
    -2487                # remove tmp
    -2488                remove_if_exists([database_export_parquet_file])
    -2489
    -2490                return os.path.exists(output_database)
    -2491
    -2492            # Partition
    -2493            if parquet_partitions:
    -2494                parquet_partitions_clean = []
    -2495                parquet_partitions_array = []
    -2496                for parquet_partition in parquet_partitions:
    -2497                    parquet_partitions_array.append(
    -2498                        parquet_partition.translate({'"': None, "'": None, " ": None})
    -2499                    )
    -2500                    parquet_partitions_clean.append(
    -2501                        '"'
    -2502                        + parquet_partition.translate({'"': None, "'": None, " ": None})
    -2503                        + '"'
    -2504                    )
    -2505                parquet_partitions_by = ",".join(parquet_partitions_clean)
    -2506                query_export_format += (
    -2507                    f", PARTITION_BY ({parquet_partitions_by}), OVERWRITE_OR_IGNORE"
    -2508                )
    -2509                export_options["partition_by"] = parquet_partitions_array
    -2510                if export_options.get("format", None) == "CSV":
    -2511                    export_mode = "duckdb"
    -2512                    post_process_just_move = True
    -2513
    -2514            # Construct query columns
    -2515            query_columns = []
    -2516
    -2517            # Add Needed columns
    -2518            for needed_column in needed_columns:
    -2519                if needed_columns[needed_column]:
    -2520                    query_column_name = needed_columns[needed_column]
    -2521                    query_column = f""" "{needed_columns[needed_column]}" """
    -2522                else:
    -2523                    query_column_name = default_empty_value
    -2524                    query_column = f""" '{default_empty_value}' """
    -2525                query_column_as = f""" "{needed_column}" """
    -2526                if query_column_name == needed_column:
    -2527                    query_columns.append(f""" {query_column} """)
    -2528                else:
    -2529                    query_columns.append(f""" {query_column} AS {query_column_as} """)
    -2530
    -2531            # Add Extra columns
    -2532            for extra_column in extra_columns:
    -2533                if extra_column not in needed_columns:
    -2534                    # query_columns.append(f""" "{extra_column}" AS "{extra_column}" """)
    -2535                    query_columns.append(f""" "{extra_column}" """)
    -2536
    -2537            # Query export columns
    -2538            query_export_columns = f""" {",".join(query_columns)} """
    -2539
    -2540            if query_columns:
    -2541
    -2542                # Compressed tmp file
    -2543                query_output_database_tmp = os.path.join(tmp_dir, "output")
    -2544
    -2545                # Query
    -2546                # If no query, generate query of the database
    -2547                if not query:
    -2548                    query = f"""
    -2549                        SELECT {query_export_columns}
    -2550                        FROM {self.get_sql_database_link(database=database)}
    -2551                        {order_by_sql}
    -2552                    """
    -2553
    -2554                # Test empty query
    -2555                df = self.conn.execute(query).fetch_record_batch(1)
    -2556                query_empty = True
    -2557                for d in df:
    -2558                    query_empty = False
    -2559                    break
    -2560                if query_empty:
    -2561                    log.error("Export failed: Empty")
    -2562                    raise ValueError("Export failed: Empty")
    +2487                # Create table in database connection with Parquet file
    +2488                query_copy = f""" 
    +2489                    CREATE TABLE {table}
    +2490                    AS {self.get_sql_database_link(database=database_export_parquet_file)}
    +2491                    """
    +2492                output_database_conn.execute(query_copy)
    +2493
    +2494                # Close connection
    +2495                output_database_conn.close()
    +2496
    +2497                # remove tmp
    +2498                remove_if_exists([database_export_parquet_file])
    +2499
    +2500                return os.path.exists(output_database)
    +2501
    +2502            # Partition
    +2503            if parquet_partitions:
    +2504                parquet_partitions_clean = []
    +2505                parquet_partitions_array = []
    +2506                for parquet_partition in parquet_partitions:
    +2507                    parquet_partitions_array.append(
    +2508                        parquet_partition.translate({'"': None, "'": None, " ": None})
    +2509                    )
    +2510                    parquet_partitions_clean.append(
    +2511                        '"'
    +2512                        + parquet_partition.translate({'"': None, "'": None, " ": None})
    +2513                        + '"'
    +2514                    )
    +2515                parquet_partitions_by = ",".join(parquet_partitions_clean)
    +2516                query_export_format += (
    +2517                    f", PARTITION_BY ({parquet_partitions_by}), OVERWRITE_OR_IGNORE"
    +2518                )
    +2519                export_options["partition_by"] = parquet_partitions_array
    +2520                if export_options.get("format", None) == "CSV":
    +2521                    export_mode = "duckdb"
    +2522                    post_process_just_move = True
    +2523
    +2524            # Construct query columns
    +2525            query_columns = []
    +2526
    +2527            # Add Needed columns
    +2528            for needed_column in needed_columns:
    +2529                if needed_columns[needed_column]:
    +2530                    query_column_name = needed_columns[needed_column]
    +2531                    query_column = f""" "{needed_columns[needed_column]}" """
    +2532                else:
    +2533                    query_column_name = default_empty_value
    +2534                    query_column = f""" '{default_empty_value}' """
    +2535                query_column_as = f""" "{needed_column}" """
    +2536                if query_column_name == needed_column:
    +2537                    query_columns.append(f""" {query_column} """)
    +2538                else:
    +2539                    query_columns.append(f""" {query_column} AS {query_column_as} """)
    +2540
    +2541            # Add Extra columns
    +2542            for extra_column in extra_columns:
    +2543                if extra_column not in needed_columns:
    +2544                    # query_columns.append(f""" "{extra_column}" AS "{extra_column}" """)
    +2545                    query_columns.append(f""" "{extra_column}" """)
    +2546
    +2547            # Query export columns
    +2548            query_export_columns = f""" {",".join(query_columns)} """
    +2549
    +2550            if query_columns:
    +2551
    +2552                # Compressed tmp file
    +2553                query_output_database_tmp = os.path.join(tmp_dir, "output")
    +2554
    +2555                # Query
    +2556                # If no query, generate query of the database
    +2557                if not query:
    +2558                    query = f"""
    +2559                        SELECT {query_export_columns}
    +2560                        FROM {self.get_sql_database_link(database=database)}
    +2561                        {order_by_sql}
    +2562                    """
     2563
    -2564                # Schema names
    -2565                schema_names = None
    -2566
    -2567                # Export mode pyarrow
    -2568                if export_mode == "pyarrow":
    -2569
    -2570                    # Compression mode
    -2571                    # If compress required and compression type as gzip or bgzip
    -2572                    # For bgzip compression, recompression will be done, and compression with gzip for tmp file done (level 1)
    -2573                    # to reduce tmp file size
    -2574                    compression_mode_gzip = compressed and (
    -2575                        export_options.get("compression", None) in ["gzip", "bgzip"]
    -2576                    )
    -2577
    -2578                    # File stream mode (str or bytes)
    -2579                    f_mode = ""
    -2580                    if compression_mode_gzip:
    -2581                        f_mode = "b"
    -2582
    -2583                    if include_header:
    -2584
    -2585                        # Open stream files (uncompressed and compressed)
    -2586                        with open(query_output_database_tmp, mode="w") as f, pgzip.open(
    -2587                            query_output_database_tmp,
    -2588                            mode="w",
    -2589                            thread=threads,
    -2590                            compresslevel=compresslevel,
    -2591                        ) as f_gz:
    -2592
    -2593                            # Switch to compressed stream file
    -2594                            if compression_mode_gzip:
    -2595                                f = f_gz
    +2564                # Test empty query
    +2565                df = self.conn.execute(query).fetch_record_batch(1)
    +2566                query_empty = True
    +2567                for d in df:
    +2568                    query_empty = False
    +2569                    break
    +2570                if query_empty:
    +2571                    log.warning("Export warning: Empty")
    +2572                    remove_header_line = False
    +2573                else:
    +2574                    remove_header_line = True
    +2575
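    A minimal standalone sketch of the empty-export check above, assuming an in-memory DuckDB connection and a hypothetical (empty) variants table: fetch_record_batch() returns a pyarrow RecordBatchReader, and the code above iterates it and flips a flag on the first batch to detect an empty result.

        import duckdb

        conn = duckdb.connect()
        conn.execute('CREATE TABLE variants ("#CHROM" VARCHAR, "POS" INTEGER)')  # hypothetical, left empty

        reader = conn.execute("SELECT * FROM variants").fetch_record_batch(1)
        query_empty = True
        for batch in reader:
            query_empty = False
            break

        # As in the code above: an empty result is only logged as a warning, and the
        # header line is kept so an empty but well-formed file can still be written.
        print("empty:", query_empty)
    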
    +2576                # Schema names
    +2577                schema_names = None
    +2578
    +2579                # Export mode pyarrow
    +2580                if export_mode == "pyarrow":
    +2581
    +2582                    # Compression mode
    +2583                    # If compress required and compression type as gzip or bgzip
    +2584                    # For bgzip compression, recompression will be done, and compression with gzip for tmp file done (level 1)
    +2585                    # to reduce tmp file size
    +2586                    compression_mode_gzip = compressed and (
    +2587                        export_options.get("compression", None) in ["gzip", "bgzip"]
    +2588                    )
    +2589
    +2590                    # File stream mode (str or bytes)
    +2591                    f_mode = ""
    +2592                    if compression_mode_gzip:
    +2593                        f_mode = "b"
    +2594
    +2595                    if include_header:
     2596
    -2597                            # Generate header tmp file
    -2598                            query_output_header_tmp = os.path.join(tmp_dir, "header")
    -2599                            self.get_header_file(
    -2600                                header_file=query_output_header_tmp,
    -2601                                remove_header_line=True,
    -2602                                sql_query=query,
    -2603                            )
    +2597                        # Open stream files (uncompressed and compressed)
    +2598                        with open(query_output_database_tmp, mode="w") as f, pgzip.open(
    +2599                            query_output_database_tmp,
    +2600                            mode="w",
    +2601                            thread=threads,
    +2602                            compresslevel=compresslevel,
    +2603                        ) as f_gz:
     2604
    -2605                            # Write header to tmp file
    -2606                            with open(
    -2607                                query_output_header_tmp, "r" + f_mode
    -2608                            ) as output_header_tmp:
    -2609                                f.write(output_header_tmp.read())
    -2610
    -2611                    # JSON format - Add special "[" character at the beginning of the file
    -2612                    if export_options.get("format") in ["JSON"]:
    -2613
    -2614                        # Open stream files (uncompressed and compressed)
    -2615                        with open(query_output_database_tmp, mode="a") as f, pgzip.open(
    -2616                            query_output_database_tmp,
    -2617                            mode="a",
    -2618                            thread=threads,
    -2619                            compresslevel=compresslevel,
    -2620                        ) as f_gz:
    -2621
    -2622                            # Switch to compressed stream file
    -2623                            if compression_mode_gzip:
    -2624                                f = f_gz
    -2625                                f.write(b"[\n")
    -2626                            else:
    -2627                                f.write("[\n")
    -2628
    -2629                    # Open stream files (uncompressed and compressed) for chunk
    -2630                    with open(query_output_database_tmp, mode="a") as f, pgzip.open(
    -2631                        query_output_database_tmp,
    -2632                        mode="a",
    -2633                        thread=threads,
    -2634                        compresslevel=compresslevel,
    -2635                    ) as f_gz:
    -2636
    -2637                        # Switch to compressed stream file
    -2638                        if compression_mode_gzip:
    -2639                            f = f_gz
    +2605                            # Switch to compressed stream file
    +2606                            if compression_mode_gzip:
    +2607                                f = f_gz
    +2608
    +2609                            # Generate header tmp file
    +2610                            query_output_header_tmp = os.path.join(tmp_dir, "header")
    +2611                            self.get_header_file(
    +2612                                header_file=query_output_header_tmp,
    +2613                                remove_header_line=remove_header_line,
    +2614                                sql_query=query,
    +2615                            )
    +2616
    +2617                            # Write header to tmp file
    +2618                            with open(
    +2619                                query_output_header_tmp, "r" + f_mode
    +2620                            ) as output_header_tmp:
    +2621                                f.write(output_header_tmp.read())
    +2622
    +2623                    # JSON format - Add special "[" character at the beginning of the file
    +2624                    if export_options.get("format") in ["JSON"]:
    +2625
    +2626                        # Open stream files (uncompressed and compressed)
    +2627                        with open(query_output_database_tmp, mode="a") as f, pgzip.open(
    +2628                            query_output_database_tmp,
    +2629                            mode="a",
    +2630                            thread=threads,
    +2631                            compresslevel=compresslevel,
    +2632                        ) as f_gz:
    +2633
    +2634                            # Switch to compressed stream file
    +2635                            if compression_mode_gzip:
    +2636                                f = f_gz
    +2637                                f.write(b"[\n")
    +2638                            else:
    +2639                                f.write("[\n")
     2640
    -2641                        # Chunk query with batch of dataframes of chunk_size
    -2642                        df = self.conn.execute(query).fetch_record_batch(chunk_size)
    -2643
    -2644                        # id of chunk
    -2645                        i = 0
    -2646
    -2647                        # For each chunk dataframe
    -2648                        for d in df:
    -2649
    -2650                            # Schema names
    -2651                            schema_names = d.schema.names
    +2641                    # Open stream files (uncompressed and compressed) for chunk
    +2642                    with open(query_output_database_tmp, mode="a") as f, pgzip.open(
    +2643                        query_output_database_tmp,
    +2644                        mode="a",
    +2645                        thread=threads,
    +2646                        compresslevel=compresslevel,
    +2647                    ) as f_gz:
    +2648
    +2649                        # Switch to compressed stream file
    +2650                        if compression_mode_gzip:
    +2651                            f = f_gz
     2652
    -2653                            # id of chunk
    -2654                            i += 1
    +2653                        # Chunk query with batch of dataframes of chunk_size
    +2654                        df = self.conn.execute(query).fetch_record_batch(chunk_size)
     2655
    -2656                            # Log - number of records
    -2657                            log.debug(f"Chunk {i}: records process...")
    +2656                        # id of chunk
    +2657                        i = 0
     2658
    -2659                            # Check process for first chunk
    -2660                            if i == 1:
    +2659                        # For each chunk dataframe
    +2660                        for d in df:
     2661
    -2662                                # If include header in file
    -2663                                header = export_options.get("header", True)
    +2662                            # Schema names
    +2663                            schema_names = d.schema.names
     2664
    -2665                                # Parquet output format
    -2666                                # Either a folder or a writer
    -2667                                if export_options.get("format") in ["PARQUET"]:
    -2668
    -2669                                    # For Parquet with multiple file - folder
    -2670                                    if export_options.get(
    -2671                                        "partition_by", None
    -2672                                    ) or export_options.get("per_thread_output", False):
    -2673                                        query_output_database_tmp = (
    -2674                                            f"{query_output_database_tmp}.parquet"
    -2675                                        )
    +2665                            # id of chunk
    +2666                            i += 1
    +2667
         +2669                            # Log - chunk progress
         +2670                            log.debug(f"Chunk {i}: processing records...")
    
    +2670
    +2671                            # Check process for first chunk
    +2672                            if i == 1:
    +2673
    +2674                                # If include header in file
    +2675                                header = export_options.get("header", True)
     2676
    -2677                                    # For Parquet as a unique file - writer
    -2678                                    else:
    -2679                                        writer = pq.ParquetWriter(
    -2680                                            query_output_database_tmp, d.schema
    -2681                                        )
    -2682
    -2683                            else:
    -2684
    -2685                                # Switch of header in file for not first chunk
    -2686                                header = False
    -2687
    -2688                            # CSV format
    -2689                            if export_options.get("format") in ["CSV"]:
    -2690
    -2691                                # With quote option
    -2692                                if export_options.get("quote", None):
    -2693
    -2694                                    # Polars write dataframe
    -2695                                    pl.from_arrow(d).write_csv(
    -2696                                        file=f,
    -2697                                        separator=export_options.get("delimiter", ""),
    -2698                                        include_header=header,
    -2699                                        quote_char=export_options.get("quote", '"'),
    -2700                                    )
    -2701
    -2702                                # Without quote option
    -2703                                else:
    -2704
    -2705                                    # Polars write dataframe
    -2706                                    pl.from_arrow(d).write_csv(
    -2707                                        file=f,
    -2708                                        separator=export_options.get("delimiter", ""),
    -2709                                        include_header=header,
    -2710                                        quote_style="never",
    -2711                                    )
    -2712
    -2713                            # JSON format
    -2714                            elif export_options.get("format") in ["JSON"]:
    -2715
    -2716                                # Compressed mode gzip
    -2717                                if compression_mode_gzip:
    -2718
    -2719                                    # Add comma at the beginning of dataframe (if not the first one) in bytes mode
    -2720                                    if i > 1:
    -2721                                        f.write(b",\n")
    -2722
    -2723                                    # Write dataframe in bytes mode
    -2724                                    f.write(
    -2725                                        str.encode(
    -2726                                            pl.from_arrow(d)
    -2727                                            .write_ndjson()
    -2728                                            .replace("\n{", ",\n{")
    -2729                                            .replace("[", "")
    -2730                                            .replace("]", "")
    -2731                                        )
    -2732                                    )
    -2733
    -2734                                # Not compressed mode gzip (string mode)
    -2735                                else:
    -2736
    -2737                                    # Add comma at the beginning of dataframe (if not the first one) in string mode
    -2738                                    if i > 1:
    -2739                                        f.write(",\n")
    -2740
    -2741                                    # Write dataframe in string mode
    -2742                                    f.write(
    -2743                                        pl.from_arrow(d)
    -2744                                        .write_ndjson()
    -2745                                        .replace("\n{", ",\n{")
    -2746                                        .replace("[", "")
    -2747                                        .replace("]", "")
    -2748                                    )
    -2749
    -2750                            # Parquet format
    -2751                            elif export_options.get("format") in ["PARQUET"]:
    +2677                                # Parquet output format
    +2678                                # Either a folder or a writer
    +2679                                if export_options.get("format") in ["PARQUET"]:
    +2680
         +2681                                    # For Parquet with multiple files - folder
    
    +2682                                    if export_options.get(
    +2683                                        "partition_by", None
    +2684                                    ) or export_options.get("per_thread_output", False):
    +2685                                        query_output_database_tmp = (
    +2686                                            f"{query_output_database_tmp}.parquet"
    +2687                                        )
    +2688
    +2689                                    # For Parquet as a unique file - writer
    +2690                                    else:
    +2691                                        writer = pq.ParquetWriter(
    +2692                                            query_output_database_tmp, d.schema
    +2693                                        )
    +2694
    +2695                            else:
    +2696
         +2697                                # Switch off header in file for chunks after the first
    
    +2698                                header = False
    +2699
    +2700                            # CSV format
    +2701                            if export_options.get("format") in ["CSV"]:
    +2702
    +2703                                # With quote option
    +2704                                if export_options.get("quote", None):
    +2705
    +2706                                    # Polars write dataframe
    +2707                                    pl.from_arrow(d).write_csv(
    +2708                                        file=f,
    +2709                                        separator=export_options.get("delimiter", ""),
    +2710                                        include_header=header,
    +2711                                        quote_char=export_options.get("quote", '"'),
    +2712                                    )
    +2713
    +2714                                # Without quote option
    +2715                                else:
    +2716
    +2717                                    # Polars write dataframe
    +2718                                    pl.from_arrow(d).write_csv(
    +2719                                        file=f,
    +2720                                        separator=export_options.get("delimiter", ""),
    +2721                                        include_header=header,
    +2722                                        quote_style="never",
    +2723                                    )
    +2724
    +2725                            # JSON format
    +2726                            elif export_options.get("format") in ["JSON"]:
    +2727
    +2728                                # Compressed mode gzip
    +2729                                if compression_mode_gzip:
    +2730
    +2731                                    # Add comma at the beginning of dataframe (if not the first one) in bytes mode
    +2732                                    if i > 1:
    +2733                                        f.write(b",\n")
    +2734
    +2735                                    # Write dataframe in bytes mode
    +2736                                    f.write(
    +2737                                        str.encode(
    +2738                                            pl.from_arrow(d)
    +2739                                            .write_ndjson()
    +2740                                            .replace("\n{", ",\n{")
    +2741                                            .replace("[", "")
    +2742                                            .replace("]", "")
    +2743                                        )
    +2744                                    )
    +2745
    +2746                                # Not compressed mode gzip (string mode)
    +2747                                else:
    +2748
    +2749                                    # Add comma at the beginning of dataframe (if not the first one) in string mode
    +2750                                    if i > 1:
    +2751                                        f.write(",\n")
     2752
    -2753                                # Partition by fields
    -2754                                partition_by = export_options.get("partition_by", None)
    -2755
    -2756                                if partition_by:
    -2757
    -2758                                    # For No partition but split parquet files into a folder
    -2759                                    if "None" in partition_by:
    -2760                                        partition_by = None
    +2753                                    # Write dataframe in string mode
    +2754                                    f.write(
    +2755                                        pl.from_arrow(d)
    +2756                                        .write_ndjson()
    +2757                                        .replace("\n{", ",\n{")
    +2758                                        .replace("[", "")
    +2759                                        .replace("]", "")
    +2760                                    )
     2761
    -2762                                    # Pyarrow write
    -2763                                    pq.write_to_dataset(
    -2764                                        pa.Table.from_batches([d]),
    -2765                                        query_output_database_tmp,
    -2766                                        partition_cols=partition_by,
    -2767                                        use_threads=threads,
    -2768                                        existing_data_behavior="overwrite_or_ignore",
    -2769                                    )
    -2770
    -2771                                # Parquet in unique file
    -2772                                else:
    -2773                                    writer.write_batch(d)
    -2774
    -2775                    # Close Parquet writer
    -2776                    if (
    -2777                        export_options.get("format") in ["PARQUET"]
    -2778                        and not export_options.get("partition_by", None)
    -2779                        and not export_options.get("per_thread_output", None)
    -2780                    ):
    -2781                        writer.close()
    +2762                            # Parquet format
    +2763                            elif export_options.get("format") in ["PARQUET"]:
    +2764
    +2765                                # Partition by fields
    +2766                                partition_by = export_options.get("partition_by", None)
    +2767
    +2768                                if partition_by:
    +2769
         +2770                                    # For no partition, but split Parquet files into a folder
    
    +2771                                    if "None" in partition_by:
    +2772                                        partition_by = None
    +2773
    +2774                                    # Pyarrow write
    +2775                                    pq.write_to_dataset(
    +2776                                        pa.Table.from_batches([d]),
    +2777                                        query_output_database_tmp,
    +2778                                        partition_cols=partition_by,
    +2779                                        use_threads=threads,
    +2780                                        existing_data_behavior="overwrite_or_ignore",
    +2781                                    )
     2782
    -2783                    # JSON format - Add special "]" character at the end of the file
    -2784                    if export_options.get("format") in ["JSON"]:
    -2785
    -2786                        # Open stream files (uncompressed and compressed)
    -2787                        with open(query_output_database_tmp, mode="a") as f, pgzip.open(
    -2788                            query_output_database_tmp,
    -2789                            mode="a",
    -2790                            thread=threads,
    -2791                            compresslevel=compresslevel,
    -2792                        ) as f_gz:
    -2793
    -2794                            # Switch to compressed stream file
    -2795                            if compression_mode_gzip:
    -2796                                f = f_gz
    -2797                                f.write(b"]\n")
    -2798                            else:
    -2799                                f.write("]\n")
    -2800
    -2801                # Export mode duckdb
    -2802                elif export_mode == "duckdb":
    -2803
    -2804                    # Create COPY TO query
    -2805                    query_copy = f""" 
    -2806                        {query_set}
    -2807                        COPY (
    -2808                            {query}
    -2809                            )
    -2810                        TO '{query_output_database_tmp}'
    -2811                        WITH ({query_export_format})
    -2812                        """
    -2813
    -2814                    # Export with duckdb
    -2815                    self.query(query=query_copy)
    -2816
    -2817                # Export mode unknown
    -2818                else:
    -2819                    log.error(f"Export mode '{export_mode}' unknown")
    -2820                    raise ValueError(f"Export mode '{export_mode}' unknown")
    -2821
    -2822                # Post process
    -2823                if post_process:
    -2824
    -2825                    # Log - number of records
    -2826                    log.debug("Post processing...")
    -2827
    -2828                    # Input files
    -2829                    input_files = []
    -2830
    -2831                    # Export mode duckdb and include header
    -2832                    if export_mode == "duckdb" and include_header:
    +2783                                # Parquet in unique file
    +2784                                else:
    +2785                                    writer.write_batch(d)
    +2786
    +2787                    # Close Parquet writer
    +2788                    if (
    +2789                        export_options.get("format") in ["PARQUET"]
    +2790                        and not export_options.get("partition_by", None)
    +2791                        and not export_options.get("per_thread_output", None)
    +2792                    ):
    +2793                        writer.close()
    +2794
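    To make the two Parquet paths above easier to follow, a short self-contained sketch (hypothetical data and output paths) of the same pyarrow calls: pyarrow.parquet.ParquetWriter appends record batches to a single file across chunks, while pyarrow.parquet.write_to_dataset writes a partitioned folder chunk by chunk.

        import pyarrow as pa
        import pyarrow.parquet as pq

        batch = pa.RecordBatch.from_pydict({"#CHROM": ["chr1", "chr2"], "POS": [100, 200]})

        # Single file: one writer kept open across chunks, closed after the last one
        writer = pq.ParquetWriter("output.parquet", batch.schema)
        writer.write_batch(batch)
        writer.close()

        # Partitioned folder: one call per chunk, existing files kept or overwritten
        pq.write_to_dataset(
            pa.Table.from_batches([batch]),
            "output.partition.parquet",
            partition_cols=["#CHROM"],
            existing_data_behavior="overwrite_or_ignore",
        )
    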
    +2795                    # JSON format - Add special "]" character at the end of the file
    +2796                    if export_options.get("format") in ["JSON"]:
    +2797
    +2798                        # Open stream files (uncompressed and compressed)
    +2799                        with open(query_output_database_tmp, mode="a") as f, pgzip.open(
    +2800                            query_output_database_tmp,
    +2801                            mode="a",
    +2802                            thread=threads,
    +2803                            compresslevel=compresslevel,
    +2804                        ) as f_gz:
    +2805
    +2806                            # Switch to compressed stream file
    +2807                            if compression_mode_gzip:
    +2808                                f = f_gz
    +2809                                f.write(b"]\n")
    +2810                            else:
    +2811                                f.write("]\n")
    +2812
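    Summarising the JSON path above in one self-contained sketch (made-up data, uncompressed for brevity): each chunk is serialised as NDJSON with polars, newlines between records become commas, a comma is written between chunks, and the file is wrapped in "[" ... "]" so it parses as a single JSON array.

        import polars as pl

        # Hypothetical chunks standing in for the record batches of the export
        chunks = [
            pl.DataFrame({"#CHROM": ["chr1", "chr1"], "POS": [100, 150]}),
            pl.DataFrame({"#CHROM": ["chr2"], "POS": [200]}),
        ]

        with open("output.json", mode="w") as f:
            f.write("[\n")
            for i, chunk in enumerate(chunks, start=1):
                if i > 1:
                    f.write(",\n")
                f.write(chunk.write_ndjson().replace("\n{", ",\n{"))
            f.write("]\n")
    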
    +2813                # Export mode duckdb
    +2814                elif export_mode == "duckdb":
    +2815
    +2816                    # Create COPY TO query
    +2817                    query_copy = f""" 
    +2818                        {query_set}
    +2819                        COPY (
    +2820                            {query}
    +2821                            )
    +2822                        TO '{query_output_database_tmp}'
    +2823                        WITH ({query_export_format})
    +2824                        """
    +2825
    +2826                    # Export with duckdb
    +2827                    self.query(query=query_copy)
    +2828
    +2829                # Export mode unknown
    +2830                else:
    +2831                    log.error(f"Export mode '{export_mode}' unknown")
    +2832                    raise ValueError(f"Export mode '{export_mode}' unknown")
     2833
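    For the "duckdb" export mode above, a minimal standalone sketch (hypothetical table, path and options) of the COPY ... TO statement that DuckDB runs in one go; the WITH (...) clause corresponds to what query_export_format carries, e.g. FORMAT CSV with DELIMITER and HEADER, or FORMAT PARQUET with PARTITION_BY.

        import duckdb

        conn = duckdb.connect()
        conn.execute("""CREATE TABLE variants AS SELECT 'chr1' AS "#CHROM", 100 AS "POS" """)

        conn.execute("""
            COPY (
                SELECT "#CHROM", "POS" FROM variants
            )
            TO 'variants.tsv'
            WITH (FORMAT CSV, DELIMITER '\t', HEADER)
        """)
    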
    -2834                        # create tmp header file
    -2835                        query_output_header_tmp = os.path.join(tmp_dir, "header")
    -2836                        tmp_files.append(query_output_header_tmp)
    -2837                        self.get_header_file(
    -2838                            header_file=query_output_header_tmp, remove_header_line=True
    -2839                        )
    -2840
    -2841                        # Add tmp header file for concat and compress
    -2842                        input_files.append(query_output_header_tmp)
    -2843
    -2844                    # Add variants file
    -2845                    input_files.append(query_output_database_tmp)
    -2846
    -2847                    # Output with concat and compress
    -2848                    if not post_process_just_move and (
    -2849                        export_mode == "duckdb"
    -2850                        or (
    -2851                            compressed
    -2852                            and export_options.get("compression", None) == "bgzip"
    -2853                        )
    -2854                        or sort
    -2855                        or index
    -2856                    ):
    -2857
    -2858                        # Compression type
    -2859                        if not compressed:
    -2860                            compression_type = "none"
    -2861                        else:
    -2862                            compression_type = export_options.get(
    -2863                                "compression", "bgzip"
    -2864                            )
    -2865
    -2866                        # Concat and compress
    -2867                        concat_and_compress_files(
    -2868                            input_files=input_files,
    -2869                            output_file=output_database,
    -2870                            compression_type=compression_type,
    -2871                            threads=threads,
    -2872                            sort=sort,
    -2873                            index=index,
    -2874                        )
    -2875
    -2876                    # Output already generated file (either compressed in gzip or not compressed, with included header if needed)
    -2877                    else:
    -2878
    -2879                        # Move tmp file
    -2880                        shutil.move(query_output_database_tmp, output_database)
    -2881
    -2882                # Generate associated header file
    -2883                if output_header and export_header:
    -2884
    -2885                    # Log - Generate header
    -2886                    log.debug("Generate header...")
    +2834                # Post process
    +2835                if post_process:
    +2836
         +2837                    # Log - post processing
    
    +2838                    log.debug("Post processing...")
    +2839
    +2840                    # Input files
    +2841                    input_files = []
    +2842
    +2843                    # Export mode duckdb and include header
    +2844                    if export_mode == "duckdb" and include_header:
    +2845
    +2846                        # create tmp header file
    +2847                        query_output_header_tmp = os.path.join(tmp_dir, "header")
    +2848                        tmp_files.append(query_output_header_tmp)
    +2849                        self.get_header_file(
    +2850                            header_file=query_output_header_tmp, remove_header_line=remove_header_line
    +2851                        )
    +2852
    +2853                        # Add tmp header file for concat and compress
    +2854                        input_files.append(query_output_header_tmp)
    +2855
    +2856                    # Add variants file
    +2857                    input_files.append(query_output_database_tmp)
    +2858
    +2859                    # Output with concat and compress
    +2860                    if not post_process_just_move and (
    +2861                        export_mode == "duckdb"
    +2862                        or (
    +2863                            compressed
    +2864                            and export_options.get("compression", None) == "bgzip"
    +2865                        )
    +2866                        or sort
    +2867                        or index
    +2868                    ):
    +2869
    +2870                        # Compression type
    +2871                        if not compressed:
    +2872                            compression_type = "none"
    +2873                        else:
    +2874                            compression_type = export_options.get(
    +2875                                "compression", "bgzip"
    +2876                            )
    +2877
    +2878                        # Concat and compress
    +2879                        concat_and_compress_files(
    +2880                            input_files=input_files,
    +2881                            output_file=output_database,
    +2882                            compression_type=compression_type,
    +2883                            threads=threads,
    +2884                            sort=sort,
    +2885                            index=index,
    +2886                        )
     2887
    -2888                    # Create database
    -2889                    database_for_header = Database(database=output_database)
    +2888                    # Output already generated file (either compressed in gzip or not compressed, with included header if needed)
    +2889                    else:
     2890
    -2891                    # Remove header if exists
    -2892                    remove_if_exists([output_header])
    +2891                        # Move tmp file
    +2892                        shutil.move(query_output_database_tmp, output_database)
     2893
    -2894                    # Find columns in database
    -2895                    if schema_names:
    -2896                        header_columns_from_database = schema_names
    -2897                    else:
    -2898                        header_columns_from_database = (
    -2899                            database_for_header.get_header_columns_from_database(
    -2900                                database=output_database
    -2901                            )
    -2902                        )
    -2903
    -2904                    # Generate header file
    -2905                    database_for_header.get_header_file(
    -2906                        header_file=output_header,
    -2907                        replace_header_line=header_columns_from_database,
    -2908                        force=True,
    -2909                    )
    -2910
    -2911            # Clean tmp files (deprecated)
    -2912            remove_if_exists(tmp_files)
    -2913
    -2914            # Return if file exists
    -2915            return os.path.exists(output_database) and self.get_type(output_database)
    +2894                # Generate associated header file
    +2895                if output_header and export_header:
    +2896
    +2897                    # Log - Generate header
    +2898                    log.debug("Generate header...")
    +2899
    +2900                    # Create database
    +2901                    database_for_header = Database(database=output_database)
    +2902
    +2903                    # Remove header if exists
    +2904                    remove_if_exists([output_header])
    +2905
    +2906                    # Find columns in database
    +2907                    if schema_names:
    +2908                        header_columns_from_database = schema_names
    +2909                    else:
    +2910                        header_columns_from_database = (
    +2911                            database_for_header.get_header_columns_from_database(
    +2912                                database=output_database
    +2913                            )
    +2914                        )
    +2915
    +2916                    # Generate header file
    +2917                    database_for_header.get_header_file(
    +2918                        header_file=output_header,
    +2919                        replace_header_line=header_columns_from_database,
    +2920                        force=True,
    +2921                    )
    +2922
    +2923            # Clean tmp files (deprecated)
    +2924            remove_if_exists(tmp_files)
    +2925
    +2926            # Return if file exists
    +2927            return os.path.exists(output_database) and self.get_type(output_database)
     
    diff --git a/docs/pdoc/howard/objects/variants.html b/docs/pdoc/howard/objects/variants.html
    index 83d5367..713f453 100644
    --- a/docs/pdoc/howard/objects/variants.html
    +++ b/docs/pdoc/howard/objects/variants.html
    @@ -6629,7 +6629,7 @@
          6234                    """
          6235                )
          6236                sql_query_annotation_to_agregate.append(
         -6237                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
         +6237                    f""" string_agg(table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
          6238                )
          6239
          6240            # Not to annotate
    @@ -18422,7 +18422,7 @@
          6235                    """
          6236                )
          6237                sql_query_annotation_to_agregate.append(
         -6238                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
         +6238                    f""" string_agg(table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
          6239                )
          6240
          6241            # Not to annotate
    @@ -32655,7 +32655,7 @@
          6235                    """
          6236                )
          6237                sql_query_annotation_to_agregate.append(
         -6238                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
         +6238                    f""" string_agg(table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
          6239                )
          6240
          6241            # Not to annotate
    diff --git a/docs/tips.pdf b/docs/tips.pdf
    index e615578..d637cc2 100644
    Binary files a/docs/tips.pdf and b/docs/tips.pdf differ
    diff --git a/docs/user_guide.pdf b/docs/user_guide.pdf
    index c099f38..f730d1e 100644
    Binary files a/docs/user_guide.pdf and b/docs/user_guide.pdf differ
    diff --git a/plugins/README.pdf b/plugins/README.pdf
    index 40f2849..3c604a5 100644
    Binary files a/plugins/README.pdf and b/plugins/README.pdf differ
    
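    Regarding the docs/pdoc/howard/objects/variants.html hunks above, which drop DISTINCT from the string_agg used to aggregate annotation values, a minimal DuckDB sketch with made-up values shows the observable difference: without DISTINCT every matching value is kept, with DISTINCT duplicates are collapsed.

        import duckdb

        conn = duckdb.connect()
        print(conn.execute("""
            SELECT string_agg(v, ',')          AS all_values,
                   string_agg(DISTINCT v, ',') AS distinct_values
            FROM (VALUES ('exon1'), ('exon1'), ('exon2')) AS t(v)
        """).fetchall())
        # e.g. [('exon1,exon1,exon2', 'exon1,exon2')] (aggregation order is not guaranteed)
    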