Skip to content

tools._mapping


def bulk_mapping(frac_data,
                bulk_adata,
                sc_adata,
                n_cell=100,
                annotation_key="curated_cell_type",
                bulk_layer=None,
                sc_layer=None,
                reorder=True,
                multiprocessing=True,
                cpu_num=cpu_count()-2,
                dataset_name="",
                out_dir=".",
                normalization=True,
                filter_gene=True,
                cut_off_value=0.6,
                save=True)

Reconstruct bulk data using single-cell data and cell type fractions.

This function maps bulk expression data to single-cell expression data using cell type fraction information and various preprocessing steps.

Parameters:

Name Type Description Default
bulk_adata AnnData

An :class:~anndata.AnnData object containing the input bulk data.

required
sc_adata AnnData

An :class:~anndata.AnnData object containing the single-cell expression data.

required
n_cell int

Number of cells per bulk sample.

100
annotation_key string

Key in sc_adata.obs for single-cell annotations.

'curated_cell_type'
bulk_layer string

Layer in bulk_adata to use for bulk expression data.

None
sc_layer string

Layer in sc_adata to use for single-cell expression data.

None
reorder bool, optional (default: True)

Whether to reorder genes to ensure consistency between bulk and single-cell data.

True
multiprocessing bool, optional (default: True)

Whether to use multiprocessing for efficiency.

True
cpu_num int

Number of CPUs to use if multiprocessing is enabled.

cpu_count() - 4
project string

Prefix for output files.

''
out_dir string

Directory to store output files.

'.'
normalization bool, optional (default: True)

Whether to apply CPM normalization to data.

True
filter_gene bool, optional (default: True)

Whether to filter genes based on cosine similarity.

True
cut_off_value float, optional (default: 0.6)

Threshold for cosine similarity when filtering genes.

0.6
save bool, optional (default: True)

Whether to save the result files.

True

Returns:

Name Type Description
bulk_adata AnnData

The processed bulk data with mapping results.

df DataFrame

DataFrame containing the mapping of bulk samples to single-cell IDs.

Source code in cytobulk\tools\_mapping.py
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
def bulk_mapping(bulk_adata,
                sc_adata,
                n_cell=100,
                annotation_key="curated_cell_type",
                bulk_layer=None,
                sc_layer=None,
                reorder=True,
                multiprocessing=True,
                cpu_num=cpu_count()-4,
                project="",
                out_dir=".",
                normalization=True,
                filter_gene=True,
                save=True,
                cut_off_value=0.6):
    """
    Reconstruct bulk data using single-cell data and cell type fractions.

    For every bulk sample, ``n_cell * fraction`` single cells (rounded down) are
    picked per cell type according to their score against the centred bulk
    profile, and their expression is summed to form the reconstructed sample.

    Parameters
    ----------
    bulk_adata : anndata.AnnData
        An :class:`~anndata.AnnData` object containing the input bulk data.
        ``bulk_adata.uns['deconv']`` must hold the cell type fraction table
        (rows are bulk samples, columns are cell types).

    sc_adata : anndata.AnnData
        An :class:`~anndata.AnnData` object containing the single-cell expression data.

    n_cell : int, optional (default: 100)
        Number of cells per bulk sample.

    annotation_key : string, optional (default: "curated_cell_type")
        Key in `sc_adata.obs` for single-cell annotations.

    bulk_layer : string, optional (default: None)
        Layer in `bulk_adata` to use for bulk expression data.

    sc_layer : string, optional (default: None)
        Layer in `sc_adata` to use for single-cell expression data.

    reorder : bool, optional (default: True)
        Whether to reorder genes to ensure consistency between bulk and single-cell data.

    multiprocessing : bool, optional (default: True)
        Whether to use multiprocessing for efficiency.

    cpu_num : int, optional (default: ``cpu_count() - 4``)
        Number of worker processes to use if multiprocessing is enabled.
        Values below 1 (e.g. the default on machines with four or fewer
        cores) are raised to 1.

    project : string, optional (default: "")
        Prefix for output files.

    out_dir : string, optional (default: ".")
        Directory to store output files; results go to ``<out_dir>/output``.

    normalization : bool, optional (default: True)
        Whether to apply CPM normalization to data.

    filter_gene : bool, optional (default: True)
        Whether to filter genes based on cosine similarity between the
        bulk expression and the reconstructed expression.

    save : bool, optional (default: True)
        Whether to save the result files.

    cut_off_value : float, optional (default: 0.6)
        Threshold for cosine similarity when filtering genes; only genes with
        a similarity strictly greater than this value are kept.

    Returns
    -------
    df : pandas.DataFrame
        DataFrame containing the mapping of bulk samples to single-cell IDs
        (columns ``sample_id`` and ``cell_id``). Returned whether or not
        `save` is set.

    bulk_adata : anndata.AnnData
        The processed bulk data with mapping results. Layers ``mapping_ori``,
        ``mapping_nor``, ``normal_center`` and ``mapping`` and
        ``obsm['cell_number']`` are filled in.

    Raises
    ------
    KeyError
        If ``bulk_adata.uns`` has no ``'deconv'`` entry.
    """

    start_t = time.perf_counter()
    print("=================================================================================================")
    print('Start to mapping bulk data with single cell dataset.')
    # format data: restrict both datasets to their shared genes
    bulk_adata.var_names_make_unique()
    sc_adata.var_names_make_unique()
    intersect_gene = bulk_adata.var_names.intersection(sc_adata.var_names)
    bulk_adata = bulk_adata[:,intersect_gene]
    sc_adata = sc_adata[:,intersect_gene]
    if 'deconv' not in bulk_adata.uns:
        raise KeyError("bulk_adata.uns['deconv'] (cell type fractions) is required for bulk mapping.")
    cell_prop = bulk_adata.uns['deconv']
    # number of cells to draw per (sample, cell type), rounded down
    cell_matrix = np.floor(n_cell * cell_prop)
    cell_num = cell_matrix.astype(int)
    meta_data = sc_adata.obs[[annotation_key]]
    # cell type -> array of the single-cell IDs annotated with it
    meta_dict = {
        cell_type: np.array(cell_ids)
        for cell_type, cell_ids in meta_data.groupby(meta_data[annotation_key]).groups.items()
    }
    cellname_list=cell_prop.columns
    cell_list = np.array(sc_adata.obs_names)
    #normalization
    bulk_adata.layers['mapping_ori'] = bulk_adata.X.copy()
    if normalization:
        sc_adata=utils.normalization_cpm(sc_adata,scale_factors=100000,trans_method="log")
        bulk_adata=utils.normalization_cpm(bulk_adata,scale_factors=100000,trans_method="log")
    bulk_adata.layers['mapping_nor'] = bulk_adata.X.copy()
    input_sc_data = get.count_data(sc_adata,counts_location=sc_layer)
    bulk_data = get.count_data(bulk_adata,counts_location=bulk_layer)

    if reorder:
        intersect_gene = input_sc_data.index.intersection(bulk_data.index)
        input_sc_data = input_sc_data.loc[intersect_gene,:]
        bulk_data = bulk_data.loc[intersect_gene,:]

    sc_data = utils.normal_center(input_sc_data)
    bulk_data = utils.normal_center(bulk_data)
    bulk_adata.layers['normal_center'] = bulk_data.T.values
    sample = np.zeros((cell_prop.shape[0],sc_data.shape[0]))
    mapped_cor = []
    sample_ori = np.zeros((cell_prop.shape[0],sc_data.shape[0]))
    sc_mapping_dict = {k: [] for k in bulk_data.columns}
    if multiprocessing:
        if cpu_count()<2:
            cpu_num = cpu_count()
        # The signature default (cpu_count()-4) is <= 0 on small machines and
        # Pool() rejects non-positive sizes, so always keep at least one worker.
        cpu_num = max(1, int(cpu_num))
        # compute person correlation and select sc according to person correlation.
        print(f"multiprocessing mode, cpu count is {cpu_num}")
        with Pool(cpu_num) as p:
            results = p.starmap(_bulk_mapping_parallel, [(i, cell_num.iloc[i,:], bulk_data, sc_data, cell_list, meta_dict, cellname_list, input_sc_data)
             for i in range(cell_num.shape[0])])
        # postprocessing
        for i, (sample_ori_i,sample_i, mapped_cor_i, sc_mapping_dict_i) in enumerate(results):
            sample_ori[i]= np.array(sample_ori_i)
            sample[i] = np.array(sample_i)
            mapped_cor.append(mapped_cor_i)
            for k in sc_mapping_dict_i.keys():
                sc_mapping_dict[k].extend(sc_mapping_dict_i[k])
    else:
        for index_num, (i, sample_num) in enumerate(tqdm(cell_num.iterrows())):
            sample_cor = np.dot(bulk_data[i].values.reshape(1,bulk_data.shape[0]),sc_data.values)
            # NOTE(review): np.argsort sorts ascending, so the slice below takes
            # the lowest-scoring cells of each type first. Confirm this matches
            # the ordering used by `_bulk_mapping_parallel`.
            cor_index = cell_list[np.argsort(sample_cor)]
            for j, cellname in enumerate(cellname_list):
                mask = np.isin(cor_index, meta_dict[cellname])
                sub_cell = cor_index[mask]
                # positional lookup: column j of the fraction table is `cellname`
                sub_cell = sub_cell[:int(sample_num.iloc[j])]
                sc_mapping_dict[i].extend(sub_cell)
            print(f"sample {i} done.")
            sample_ori[index_num,:] = input_sc_data.loc[:,sc_mapping_dict[i]].sum(axis=1)
            sample[index_num,:] = sc_data.loc[:,sc_mapping_dict[i]].sum(axis=1)
            mapped_cor_i = utils.pear(sample[index_num,:],bulk_data[i].values).item()
            mapped_cor.append(mapped_cor_i)
    print('initial mapping solution:',"min correlation", min(mapped_cor),"average correlation",np.mean(mapped_cor),"max correlation", max(mapped_cor))
    bulk_adata.obsm['cell_number']=pd.DataFrame(cell_matrix,index=cell_prop.index,columns=cell_prop.columns)
    bulk_adata.layers['mapping'] = sample/n_cell
    # overwrites the raw copy stored before normalization with the reconstruction
    bulk_adata.layers['mapping_ori'] = sample_ori/n_cell
    if filter_gene:
        from sklearn.metrics.pairwise import cosine_similarity
        gene_list = []
        similarity_list=[]
        data_ori = pd.DataFrame(bulk_adata.X,index=bulk_adata.obs_names,columns=bulk_adata.var_names)
        data_mapping = pd.DataFrame(bulk_adata.layers['mapping_ori'],index=bulk_adata.obs_names,columns=bulk_adata.var_names)
        for gene in bulk_adata.var_names:
            # cosine_similarity returns a 1x1 matrix for two single-row inputs
            similarity = cosine_similarity(data_ori[gene].values.reshape(1, -1), data_mapping[gene].values.reshape(1, -1))[0][0]
            if similarity > cut_off_value:
                similarity_list.append(similarity)
                gene_list.append(gene)
        bulk_adata=bulk_adata[:,gene_list].copy()
        # min()/max() raise on an empty list, so only report when a gene passed
        if similarity_list:
            print('Gene cosin similarity:',"min value", min(similarity_list),"average value",np.mean(similarity_list),"max value", max(similarity_list))
        print(f'The number of reconstructed gene:{len(gene_list)}')
    print(f'Time to finish mapping: {round(time.perf_counter() - start_t, 2)} seconds')
    print("=========================================================================================================================================")
    # Built unconditionally so the return value exists even when save=False.
    df = pd.DataFrame([(k, v) for k, lst in sc_mapping_dict.items() for v in lst], columns=['sample_id', 'cell_id'])
    if save:
        out_dir = utils.check_paths(f'{out_dir}/output')
        df.to_csv(f"{out_dir}/bulk_data_{project}_mapping.csv")
        bulk_adata.write_h5ad(f"{out_dir}/bulk_data_{project}_mapping.h5ad")

    return df,bulk_adata

def st_mapping(st_adata,
               sc_adata,
               out_dir,
               project,
               annotation_key,
               **kwargs)

Run spatial transcriptomics mapping with single-cell RNA-seq data.

This function maps spatial transcriptomics (ST) data to single-cell RNA-seq (scRNA-seq) data. It aligns cell type compositions and estimates spatial distributions.

Parameters:

Name Type Description Default
st_adata AnnData

An :class:~anndata.AnnData object containing spatial transcriptomics data.

required
sc_adata AnnData

An :class:~anndata.AnnData object containing single-cell RNA-seq data.

required
seed int, optional (default: 0)

Seed for random number generation to ensure reproducibility.

required
annotation_key string, optional (default: 'celltype_minor')

Key in sc_adata for cell type annotations.

required
sc_downsample bool, optional (default: False)

Whether to downsample scRNA-seq data to a maximum number of transcripts per cell.

required
scRNA_max_transcripts_per_cell int, optional (default: 1500)

Maximum number of transcripts per cell for downsampling.

required
sampling_method string, optional (default: 'duplicates')

Method for sampling single cells based on cell type composition.

required
out_dir string, optional (default: '.')

Directory to save output files.

required
project string, optional (default: 'test')

Project name for output file naming.

required
mean_cell_numbers int, optional (default: 8)

Average number of cells per spot used for estimation.

required
save_reconstructed_st bool, optional (default: True)

Whether to save the reconstructed spatial transcriptomics data.

required

Returns:

Name Type Description
reconstructed_sc DataFrame

DataFrame containing the mapping of single-cell IDs to spatial spot IDs.

Source code in cytobulk\tools\_mapping.py
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
def st_mapping(st_adata,
               sc_adata,
               out_dir,
               project,
               annotation_key,
               **kwargs):
    """
    Run spatial transcriptomics mapping with single-cell RNA-seq data.

    Thin timing/logging wrapper: all arguments are forwarded unchanged to
    ``_run_st_mapping`` and its result is returned.

    Parameters
    ----------
    st_adata : anndata.AnnData
        An :class:`~anndata.AnnData` object containing spatial transcriptomics data.

    sc_adata : anndata.AnnData
        An :class:`~anndata.AnnData` object containing single-cell RNA-seq data.

    out_dir : string
        Directory to save output files. Required.

    project : string
        Project name for output file naming. Required.

    annotation_key : string
        Key in `sc_adata` for cell type annotations. Required.

    **kwargs : dict
        Extra options forwarded to ``_run_st_mapping``. The options below were
        listed in the previous documentation of this function; their defaults
        are presumably those of ``_run_st_mapping`` (not verifiable from this
        function):

        - seed : int (0) -- seed for random number generation.
        - sc_downsample : bool (False) -- whether to downsample scRNA-seq data
          to a maximum number of transcripts per cell.
        - scRNA_max_transcripts_per_cell : int (1500) -- maximum number of
          transcripts per cell for downsampling.
        - sampling_method : string ('duplicates') -- method for sampling single
          cells based on cell type composition.
        - mean_cell_numbers : int (8) -- average number of cells per spot.
        - save_reconstructed_st : bool (True) -- whether to save the
          reconstructed spatial transcriptomics data.

    Returns
    -------
    reconstructed_sc : pandas.DataFrame
        DataFrame containing the mapping of single-cell IDs to spatial spot IDs.
    """
    start_t = time.perf_counter()
    print("=================================================================================================")
    # This entry point maps ST data; the message used to be copied from bulk_mapping.
    print('Start to mapping spatial transcriptomics data with single cell dataset.')
    reconstructed_sc=_run_st_mapping(st_adata = st_adata,
                                     sc_adata = sc_adata,
                                     out_dir = out_dir,
                                     project = project,
                                     annotation_key=annotation_key,
                                     **kwargs)
    print(f'Time to finish mapping: {round(time.perf_counter() - start_t, 2)} seconds')
    print("=========================================================================================================================================")

    return reconstructed_sc

def he_mapping(image_dir,
               out_dir,
               project,
               lr_data = None,
               sc_adata = None,
               annotation_key="curated_celltype",
               k_neighbor=30,
               alpha=0.5,
               mapping_sc=True,
               **kwargs)

Run H&E-stained image cell type mapping with single-cell RNA-seq data.

This function predicts cell types from H&E-stained histology images and aligns them with single-cell RNA-seq (scRNA-seq) data using optimal transport. It computes spatial distributions and matches cell types between the image and single-cell data.

Parameters:

Name Type Description Default
image_dir str

Path to the directory containing H&E-stained images.

required
out_dir str

Directory where the output files will be saved.

required
project str

Name of the project, used for naming output files.

required
lr_data pandas.DataFrame, optional (default: None)

A DataFrame containing ligand-receptor pair data with columns 'ligand' and 'receptor'.

None
sc_adata anndata.AnnData, optional (default: None)

An :class:~anndata.AnnData object containing single-cell RNA-seq data with gene expression profiles.

None
annotation_key str, optional (default: "curated_celltype")

Key in sc_adata.obs for cell type annotations.

'curated_celltype'
k_neighbor int, optional (default: 30)

Number of neighbors to consider when constructing the graph for H&E image data.

30
alpha float, optional (default: 0.5)

Trade-off parameter for the Fused Gromov-Wasserstein optimal transport, controlling the balance between graph structure and feature matching (value between 0 and 1).

0.5
mapping_sc bool, optional (default: True)

Whether to perform mapping between H&E image cell data and single-cell RNA-seq data. If False, only H&E image cell type predictions are returned.

True
**kwargs dict

Additional arguments (not used in this implementation).

{}

Returns:

Name Type Description
cell_coordinates DataFrame

DataFrame containing cell coordinates and their predicted cell types from H&E-stained images.

df DataFrame

DataFrame containing matching results between H&E image cells and single-cell data, including spatial coordinates, cell types, and matched single-cell IDs.

filtered_adata AnnData

A filtered :class:~anndata.AnnData object containing only the single-cell data that matches with cells from H&E-stained images.

Source code in cytobulk\tools\_mapping.py
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
def he_mapping(image_dir,
               out_dir,
               project,
               lr_data = None,
               sc_adata = None,
               annotation_key="curated_celltype",
               k_neighbor=30,
               alpha=0.5,
               mapping_sc=True,
               **kwargs):

    """
    Run H&E-stained image cell type mapping with single-cell RNA-seq data.

    Cell types are first predicted from the H&E image. When `mapping_sc` is
    True, the predicted cells are then matched to single cells with Fused
    Gromov-Wasserstein optimal transport between a k-nearest-neighbour graph of
    the image cells and a ligand-receptor affinity graph of the single cells.

    Parameters
    ----------
    image_dir : str
        Path to the directory containing H&E-stained images.

    out_dir : str
        Directory where the output files will be saved.

    project : str
        Name of the project, used for naming output files.

    lr_data : pandas.DataFrame, optional (default: None)
        A DataFrame containing ligand-receptor pair data with columns 'ligand'
        and 'receptor'. Required when `mapping_sc` is True.

    sc_adata : anndata.AnnData, optional (default: None)
        An :class:`~anndata.AnnData` object containing single-cell RNA-seq data
        with gene expression profiles. Required when `mapping_sc` is True.
        The object passed in is not modified; preprocessing runs on a copy.

    annotation_key : str, optional (default: "curated_celltype")
        Key in `sc_adata.obs` for cell type annotations.

    k_neighbor : int, optional (default: 30)
        Number of neighbors to consider when constructing the graph for H&E image data.

    alpha : float, optional (default: 0.5)
        Trade-off parameter for the Fused Gromov-Wasserstein optimal transport,
        controlling the balance between graph structure and feature matching
        (value between 0 and 1).

    mapping_sc : bool, optional (default: True)
        Whether to perform mapping between H&E image cell data and single-cell
        RNA-seq data. If False, only H&E image cell type predictions are returned.

    **kwargs : dict
        Additional arguments (not used in this implementation).

    Returns
    -------
    cell_coordinates : pandas.DataFrame
        DataFrame containing cell coordinates and their predicted cell types
        from H&E-stained images. This is the only value returned when
        `mapping_sc` is False; otherwise the three values below are returned
        as a tuple ``(cell_coordinates, df, filtered_adata)``.

    df : pandas.DataFrame
        DataFrame containing matching results between H&E image cells and
        single-cell data, including spatial coordinates, cell types, and
        matched single-cell IDs. Also written to
        ``<out_dir>/<project>_matching_results.csv``.

    filtered_adata : anndata.AnnData
        A filtered :class:`~anndata.AnnData` object containing only the
        single-cell data that matches with cells from H&E-stained images. Its
        ``obs`` holds the matching record of each cell (the first one when a
        cell is matched to several image locations). Also written to
        ``<out_dir>/<project>_matching_adata.h5ad``.

    Raises
    ------
    ValueError
        If `mapping_sc` is True but `lr_data` or `sc_adata` is missing, or if
        the image predictions and the single-cell annotations share no cell type.
    """
    # Fail fast, before the model download and the image inference, instead of
    # crashing later on a None input.
    if mapping_sc and (lr_data is None or sc_adata is None):
        raise ValueError("`lr_data` and `sc_adata` are required when `mapping_sc` is True.")

    start_t = time.perf_counter()
    file_dir = resource_filename(__name__, 'model/pretrained_models/')
    file_name = 'DeepCMorph_Datasets_Combined_41_classes_acc_8159.pth'

    # The download URL for the file (replace with the actual URL)
    # NOTE(review): the URL points at the 32-class "Pan_Cancer_Regularized"
    # weights while `file_name` is the 41-class "Datasets_Combined" file;
    # confirm which model is intended.
    download_url = "https://data.vision.ee.ethz.ch/ihnatova/public/DeepCMorph/DeepCMorph_Pan_Cancer_Regularized_32_classes_acc_8200.pth"

    # Ensure the file exists; if not, download it
    get.ensure_file_exists(file_dir, file_name, download_url)
    cell_coordinates = inference_cell_type_from_he_image(image_dir,
                                                         out_dir,
                                                         project)
    if not mapping_sc:
        return cell_coordinates

    print("preprocessing of single cell data")
    # The scanpy calls below filter and normalise in place; work on a copy so
    # the caller's AnnData is left untouched (and is not log-transformed twice
    # on a second call).
    sc_adata = sc_adata.copy()
    lr_genes = np.unique(np.concatenate((lr_data['ligand'].values, lr_data['receptor'].values)))
    sc.pp.filter_cells(sc_adata, min_genes=200)  # filter
    sc.pp.filter_genes(sc_adata, min_cells=3)   # filter
    sc.pp.normalize_total(sc_adata, target_sum=1e4)  # nor
    sc.pp.log1p(sc_adata)  # log
    # keep only ligand/receptor genes present in the single-cell data
    common_gene = np.intersect1d(lr_genes, sc_adata.var_names)
    sc_adata = sc_adata[:, common_gene].copy()
    lr_data = lr_data[
        (lr_data['ligand'].isin(common_gene)) & (lr_data['receptor'].isin(common_gene))
    ].copy()

    adata_cell_types = set(sc_adata.obs[annotation_key].unique())
    coordinates_cell_types = set(cell_coordinates["cell_type"].unique())

    # common_celltype
    common_cell_types = adata_cell_types.intersection(coordinates_cell_types)
    print(f"Common cell types: {common_cell_types}")
    if not common_cell_types:
        raise ValueError(
            f"No cell type is shared between the H&E predictions and sc_adata.obs['{annotation_key}']."
        )

    # filter adata and cell_coordinates
    sc_adata = sc_adata[sc_adata.obs[annotation_key].isin(common_cell_types), :].copy()
    cell_coordinates = cell_coordinates[cell_coordinates["cell_type"].isin(common_cell_types)].copy()

    print("loading graph for H&E image...")
    graph1_adj, graph1_labels = load_graph1(cell_coordinates,k=k_neighbor)

    print("loading graph for single cell data with LR affinity...")
    graph2_dist, graph2_labels,sc_adata = load_graph2_with_LR_affinity(sc_adata,
                                                            graph1_labels,
                                                            lr_data,
                                                            annotation_key)

    # replace NaN / +inf by the largest finite-or-inf value found, -inf by 0
    max_dist = np.nanmax(graph2_dist)
    graph2_dist = np.nan_to_num(graph2_dist, nan=max_dist, posinf=max_dist, neginf=0)

    print("compute cost matrix")
    cost_matrix = construct_cost_matrix(graph1_labels, graph2_labels)

    max_cost = np.nanmax(cost_matrix)
    cost_matrix = np.nan_to_num(cost_matrix, nan=max_cost, posinf=max_cost, neginf=0)

    print("optimal transport...")
    # uniform marginals over image cells (p) and single cells (q)
    p = np.ones(graph1_adj.shape[0]) / graph1_adj.shape[0]
    q = np.ones(graph2_dist.shape[0]) / graph2_dist.shape[0]

    # Fused Gromov-Wasserstein
    ot_plan = ot.gromov.fused_gromov_wasserstein(
        cost_matrix, graph1_adj, graph2_dist, p, q, alpha=alpha, loss_fun='square_loss'
    )
    print(f'Time to finish mapping: {round(time.perf_counter() - start_t, 2)} seconds')
    print("=========================================================================================================================================")
    # Step 5: matching file
    print("build matching file...")
    locations = list(range(graph1_adj.shape[0]))
    cells = list(range(graph2_dist.shape[0]))
    matches = extract_matching_relationships(ot_plan, locations, cells)
    df = pd.DataFrame(matches, columns=["location", "cell"])
    # `location` / `cell` are positional indices into cell_coordinates / sc_adata
    coord_x = cell_coordinates['x'].values
    coord_y = cell_coordinates['y'].values
    coord_type = cell_coordinates['cell_type'].values
    sc_names = sc_adata.obs_names
    df["x"] = df["location"].map(lambda c: coord_x[c])
    df["y"] = df["location"].map(lambda c: coord_y[c])
    df["he_cell_type"] = df["location"].map(lambda c: coord_type[c])
    df["cell_type"] = df["cell"].map(lambda c: graph2_labels[c])
    df["cell_id"] = df["cell"].map(lambda c: sc_names[c])
    df.to_csv(f"{out_dir}/{project}_matching_results.csv", index=False)

    matched_ids = set(df['cell_id'])
    filtered_adata = sc_adata[sc_adata.obs_names.isin(matched_ids)].copy()
    # Assigning `.obs` replaces the index without aligning it, so the table
    # must have exactly one row per cell, in the AnnData's own row order.
    # A cell matched to several image locations keeps its first match here;
    # the complete table is still available in `df` and the CSV.
    obs_table = df.set_index('cell_id')
    obs_table = obs_table[~obs_table.index.duplicated(keep='first')]
    obs_table = obs_table.loc[filtered_adata.obs_names]
    obs_table.index.name = 'cell_id'
    filtered_adata.obs = obs_table
    filtered_adata.var.index.name = "gene"
    filtered_adata.write_h5ad(f"{out_dir}/{project}_matching_adata.h5ad")
    return cell_coordinates,df,filtered_adata