# Just a place to dump various code and info
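# Imports assumed by the code below. The standard library and third-party
# modules are real; the remaining names used later (auth, datasets, atlases,
# errors, the mongo handle `database`, `_exclude_list` and `Atlas`) are
# internal to the Stemformatics codebase and are not imported here, since
# their module paths would only be guesses.
import os
import csv
import uuid
from datetime import datetime

import pandas
import werkzeug.datastructures
from flask_restful import Resource, reqparse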
class AtlasProjection(Resource):

    def check_format(self, expression, samples, sample_column):
        """Check the format of bulk data files uploaded for projection."""
        # The number of columns in the expression matrix and the number of rows
        # in the sample matrix must be the same
        if len(expression.columns) != len(samples):
            return {'error': 'The number of columns in the expression matrix and rows in the sample matrix need to match. Check the file formats.'}
        # Every column heading in the expression matrix must be present among
        # the row IDs of the sample matrix
        for column in expression.columns:
            if column not in samples.index:
                return {'error': 'The column headings in the expression matrix need to be present in the row IDs in the sample matrix. Check the file formats.'}
        # The expression matrix must use Ensembl gene IDs as row IDs
        for index in expression.index:
            if index[0:4] != 'ENSG':
                return {'error': 'The expression matrix must use Ensembl IDs as row IDs. Check the file format.'}
        # If a sample column was given, it must exist in the sample matrix
        if sample_column not in samples.columns and sample_column != "":
            return {'error': 'The sample column must be a column that exists in the sample matrix. Check the sample column.'}
        return {'error': ""}
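
    # A minimal sketch of how check_format reports problems; the frames below
    # are hypothetical, not real atlas data. Two columns in the expression
    # matrix but only one row in the sample table trips the first check:
    #
    #   expression = pandas.DataFrame({'s1': [1], 's2': [2]}, index=['ENSG00000123456'])
    #   samples = pandas.DataFrame({'cell_type': ['HSC']}, index=['s1'])
    #   AtlasProjection().check_format(expression, samples, '')
    #   # -> {'error': 'The number of columns in the expression matrix and rows ...'}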
    def post(self, atlasType, dataSource):
        """Project data onto the atlas of atlasType. dataSource is one of
        ['stemformatics','user','user-single'].
        """
        try:
            # Project single-cell data - user upload
            if dataSource.lower() == "user-single":
                parser = reqparse.RequestParser()
                parser.add_argument('email', type=str, required=False)
                parser.add_argument('data', type=werkzeug.datastructures.FileStorage, location='files')
                args = parser.parse_args()
                email = args.get('email')
                df = pandas.read_csv(args.get('data'), sep='\t', index_col=0)
                if len(df) == 0:
                    return {'error': 'The data matrix came back as zero length. Check its format.'}
                # Create a unique ID for this upload ('uid' rather than 'id',
                # which shadows the builtin)
                uid = uuid.uuid4()
                # Check the ID is not already in ids.csv
                try:
                    path = "/mnt/stemformatics-data/user_projection_data/ids.csv"
                    ids_df = pandas.read_csv(path, index_col=0)
                    # Compare as strings - the index read from csv holds strings,
                    # so a raw UUID object would never match
                    while str(uid) in ids_df.index:
                        uid = uuid.uuid4()
                    # Create a directory named after the ID
                    new_path = "/mnt/stemformatics-data/user_projection_data/{}".format(uid)
                    if not os.path.exists(new_path):
                        os.makedirs(new_path)
                    # Save df there as the input file
                    df.to_csv(new_path + '/input.tsv', sep='\t')
                    # Append details of this upload to ids.csv
                    now = datetime.now()
                    try:
                        data = [uid, email, atlasType, now.strftime('%d/%m/%Y %H:%M:%S'), '', '']
                        # The with block closes the file on exit; no explicit close() is needed
                        with open(path, 'a', newline='') as fd:
                            csv.writer(fd).writerow(data)
                    except FileNotFoundError:
                        print('File not found.')
                    except IOError:
                        print('File IO error.')
                    except Exception as e:
                        print(e)
                        return
                except IOError:
                    print("Error: file does not exist.")
                    return
            # Stemformatics or bulk data options
            else:
                parser = reqparse.RequestParser()
                parser.add_argument('name', type=str, required=False)  # selected Stemformatics dataset name (eg. Helft_2017_28723558)
                parser.add_argument('test_name', type=str, required=False, default="test-data")  # user projected dataset name
                parser.add_argument('test_sample_column', type=str, required=False)  # user supplied sample column to use for mapping
                # The following return a FileStorage object under each key
                parser.add_argument('test_expression', type=werkzeug.datastructures.FileStorage, location='files')
                parser.add_argument('test_samples', type=werkzeug.datastructures.FileStorage, location='files')
                args = parser.parse_args()
                if dataSource.lower() == "stemformatics":
                    name = args.get('name').split("_")[0]  # only take the author name for this
                    # Find the dataset with this name in Stemformatics
                    publicOnly = auth.AuthUser().username() is None  # public datasets only if there is no authenticated user
                    dsId = datasets.datasetIdFromName(args.get('name'), publicOnly=publicOnly)
                    ds = datasets.Dataset(dsId)
                    df = ds.expressionMatrix(key="genes" if ds.metadata()['platform_type'] == 'Microarray' else "raw")
                    samples = ds.samples()
                    # Select the column to use for mapping - the most informative
                    # column may not be cell_type, so fall back through candidates
                    column = 'cell_type'
                    for col in ['cell_type', 'sample_type', 'final_cell_type']:
                        if len(samples[col].unique()) >= 2:  # use this one
                            column = col
                            break
                    samples = samples.fillna('[not assigned]')
                else:
                    name = args.get('test_name')
                    df = pandas.read_csv(args.get('test_expression'), sep='\t', index_col=0)
                    samples = pandas.read_csv(args.get('test_samples'), sep='\t', index_col=0)
                    # Validate the format of the uploaded data files
                    check = self.check_format(df, samples, args.get('test_sample_column'))
                    if check["error"] != "":
                        return {"error": check["error"]}
                    # Some validation on user supplied data
                    if len(df) == 0:
                        return {'error': 'The expression matrix came back as zero length. Check its format.'}
                    elif len(samples) == 0:
                        return {'error': 'The sample table came back as zero length. Check its format and ensure its row index matches the columns of the expression matrix.'}
                    samples = samples.loc[df.columns]
                    column = args.get('test_sample_column')
                    if column not in samples.columns:
                        column = samples.columns[0]
                # Create an atlas data instance and perform the projection
                atlas = atlases.Atlas(atlasType)
                result = atlas.projection(name, df, includeCombinedCoords=False)
                if result["error"] != "":  # returning an empty data frame may raise when parsing as json, so just return the error string
                    return {"error": result["error"]}
                # Prepare the dictionary to return - each object must be JSON
                # serializable, so don't return data frames directly
                result["coords"] = result["coords"].to_dict(orient="records")
                result["samples"] = samples.reset_index().fillna('').to_dict(orient="records")
                result["sampleIds"] = ["%s_%s" % (name, item) for item in samples.index]
                result["column"] = column
                if "combinedCoords" in result:
                    result["combinedCoords"] = result["combinedCoords"].to_dict(orient="split")
                if "capybara" in result:
                    for col in result["capybara"]:
                        result["capybara"][col] = result["capybara"][col].to_dict(orient="split")
                return result
        except Exception:  # avoid a bare except, which would also catch SystemExit
            raise errors.DatasetProjectionFailedError
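
# A hedged usage sketch, not a confirmed route: assuming this resource is
# registered at /atlas-projection/<atlasType>/<dataSource>, a bulk projection
# could be requested like this (host, route and file names are hypothetical).
#
#   import requests
#   files = {'test_expression': open('expression.tsv', 'rb'),
#            'test_samples': open('samples.tsv', 'rb')}
#   data = {'test_name': 'my-data', 'test_sample_column': 'cell_type'}
#   r = requests.post('http://localhost:5000/atlas-projection/myeloid/user',
#                     files=files, data=data)
#   print(r.json().get('error'))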
def datasetMetadataFromQuery(**kwargs):
    """Return a DataFrame of dataset metadata matching a query. Rows are indexed
    by dataset id, while columns are attributes of dataset metadata. Use this
    instead of Dataset instances when fetching large numbers of datasets.
    If ids_only=True, only a list of dataset ids is returned, instead of a DataFrame.
    Note that query_string will also search the samples collection if
    include_samples_query=True. Call datasetMetadataFromQuery() with no
    arguments to fetch all datasets.
    """
    limit = kwargs.get("limit")
    ids_only = kwargs.get("ids_only", False)
    public_only = kwargs.get("public_only", True)
    include_samples_query = kwargs.get("include_samples_query", False)
    dataset_id = kwargs.get("dataset_id")  # list of dataset ids specified in the query
    name = kwargs.get("name")
    query_string = kwargs.get("query_string")
    platform_type = kwargs.get("platform_type")
    projects = kwargs.get("projects")
    organism = kwargs.get("organism")
    status = kwargs.get("status")

    # params for the find function (ie. fetch all records matching params),
    # and attributes for what to return
    params = {'dataset_id': {"$nin": _exclude_list}}
    attributes = {"dataset_id": 1, "_id": 0} if ids_only else {"_id": 0}
    if public_only:
        params['private'] = False

    datasetIds = []  # additional dataset ids to search for, based on sample search
    searched = False  # need to know if datasetIds is still empty after performing searches
    if query_string:
        if include_samples_query:
            # Perform a text search in both datasets and samples and take the union
            sampleSearch = database["samples"].find({'$text': {'$search': query_string}}, {'dataset_id': 1})
            datasetIds = [item['dataset_id'] for item in sampleSearch]
            datasetsSearch = database["datasets"].find({'$text': {'$search': query_string}}, {'dataset_id': 1})
            datasetIds = list(set(datasetIds).union(set([item['dataset_id'] for item in datasetsSearch])))
            if len(datasetIds) == 0:
                params['dataset_id']["$in"] = []
        else:  # otherwise search the datasets collection only
            params['$text'] = {"$search": query_string}
        searched = True
    if organism and organism != 'all':  # restrict datasets to samples with this organism
        sampleSearch = database["samples"].find({'organism': organism}, {'dataset_id': 1})
        organismIds = set([item['dataset_id'] for item in sampleSearch])
        # Only intersect when a previous search populated datasetIds; otherwise an
        # organism-only query would always intersect with the empty set and return nothing
        datasetIds = list(set(datasetIds).intersection(organismIds)) if datasetIds else list(organismIds)
        searched = True
    if dataset_id is not None and len(datasetIds) > 0:  # find common dataset ids
        datasetIds = list(set(datasetIds).intersection(set([int(item) for item in dataset_id])))
    elif dataset_id is not None and len(datasetIds) == 0:  # just those specified by the parameter
        datasetIds = [int(item) for item in dataset_id]
    if len(datasetIds) > 0:
        params['dataset_id']["$in"] = datasetIds
    elif searched:  # after searching we found no matching datasets
        return [] if ids_only else pandas.DataFrame()
    if platform_type:
        # Assume a list of platform types was supplied if there's a comma
        params['platform_type'] = {"$in": platform_type.split(',')} if ',' in platform_type else platform_type
    if projects:
        if projects == 'atlas':  # any atlas project
            params["projects"] = {"$in": ["%s_atlas" % atlasType for atlasType in Atlas.all_atlas_types]}
        else:
            params["projects"] = {"$in": [projects]}
    if status:
        params["status"] = status
    if name:
        params["name"] = name

    cursor = database["datasets"].find(params, attributes)
    if limit:
        cursor = cursor.limit(limit)
    if ids_only:
        return [item["dataset_id"] for item in cursor]
    # Avoid the deprecated Cursor.count() by materialising the frame first
    df = pandas.DataFrame(cursor)
    return df.set_index("dataset_id") if len(df) > 0 else df
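
# Example calls, assuming a live MongoDB connection behind `database`;
# the argument values here are illustrative only.
#
#   df = datasetMetadataFromQuery(organism='homo sapiens', platform_type='RNASeq')
#   ids = datasetMetadataFromQuery(query_string='macrophage',
#                                  include_samples_query=True, ids_only=True)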