## Setting the paths
PROJECT_NAME = 'sample_pipeline'
PROJECT_DIRECTORY = '../../pipeline'  # the pipeline folder structure will be created within this folder
TOMO_DIR = './toy_data/tomograms'  # path of the directory containing the data (Tomo1, Tomo2, ...)
## General project settings
PIXEL_SPACING_BIN1 = 13.68
UNBINNED_OFFSET_Z = 3174. # This should normally be 0. For toy data, it is 3174.
TOMO_BINNING = 4
USE_GPU = True
PICK_ON_BOTH_SIDES = True  # This should be True if you are working on a cluster without the possibility to use
# graphical user interfaces (from matplotlib)
TEST_ALL = False # Should all membranes be assigned to the test set?
GT_AVAIL = True # Is ground truth data available?
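# For reference (derived value, not a pipeline setting): the effective pixel spacing at the working binning is the
# unbinned spacing times the binning factor, i.e. PIXEL_SPACING_BIN1 * TOMO_BINNING = 13.68 * 4 = 54.72 A / voxel.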
## Protein details
MEMBRANORAMA_XMLS = True  # This is unused in the current version (tuning)
USE_GT_ORIENTATIONS = True
# Protein tokens:
# For each protein you want to detect, specify a dictionary entry. The list of names corresponds to the possible
# names in the membranorama file. E.g., 'PSII': ['PSII', 'PS2'] means that for protein 'PSII', both the 'PSII' and
# 'PS2' tokens are accepted.
PROT_TOKENS = {'PSII': ['PSII', 'PS2'],
               'PSI': ['PSI_', 'PS1'],
               'b6f': ['b6f', 'bf6'],
               'UK': ['UK', 'unknown']}
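# Illustrative sketch (hypothetical helper, not part of the pipeline): how a membranorama name could be resolved
# to a PROT_TOKENS key under the matching semantics described above.
def _resolve_protein_key(name):
    for key, tokens in PROT_TOKENS.items():
        if any(token in name for token in tokens):
            return key
    return None
# e.g. _resolve_protein_key('PS2_particle') returns 'PSII'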
# If you want to use the particle shapes for training, please specify the paths to the structures used to map into
# the membranorama views for generating the ground truth.
# In case you have clicked a particle without a specific shape, you can also use the string "sphereX", where X
# corresponds to the desired sphere radius.
PSII_PARTICLE = './toy_data/structures/Chlamy_C2_14A.mrc'
B6F_PARTICLE = './toy_data/structures/Cyt b6f_14A_center.mrc'
UK_PARTICLE = 'sphere12' # "sphereX" tells the program to use a sphere with radius X voxels as a particle model
PROT_SHAPES = {'PSII': PSII_PARTICLE, 'b6f': B6F_PARTICLE, 'UK': UK_PARTICLE} # keys should correspond to keys of PROT_TOKENS
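# Illustrative sanity check (an addition based on the note above, not required by the pipeline): every PROT_SHAPES
# key should also appear in PROT_TOKENS. A "sphereX" string could be parsed via int(UK_PARTICLE[len('sphere'):]) -> 12.
assert set(PROT_SHAPES).issubset(PROT_TOKENS), 'PROT_SHAPES keys must correspond to keys of PROT_TOKENS'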
## Efficiency details
N_PR_NORMALVOTING = 4  # number of processes used for normal voting
N_PR_ROTATION = 1  # number of processes used for rotating subvolumes; more than one process is only recommended for
# small subvolumes with many sampled points --> usually 1 is enough!
## Preprocessing details
USE_ROTATION_NORMALIZATION = True  # experimental & might not work if set to False!
ROTATION_AUGMENTATION_DURING_TRAINING = False  # This enables rotational augmentation around random angles for all axes.
# If deactivated, rotational augmentation around the z-axis is still performed.
BOX_RANGE = 6  # size of the sampled subvolumes (cube with edge length BOX_RANGE * 2)
LP_CUTOFF = None  # cutoff value for low-pass filtering of the tomogram before extracting subvolumes
# (can increase generalizability, but takes some time)
# If LP_CUTOFF = None, no low-pass filtering is performed. Otherwise it should be in the range 0.0 - 0.25.
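# Hedged sanity check (an assumption based on the comment above): if a cutoff is given, it should lie in (0.0, 0.25].
if LP_CUTOFF is not None:
    assert 0.0 < LP_CUTOFF <= 0.25, 'LP_CUTOFF should be in the range 0.0 - 0.25'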
## Training settings
# Specify the tokens used for the training, validation, and test sets.
# Should be a dictionary with tomogram tokens as keys. For each key, specify a list, where each entry of the
# list specifies a certain membrane via (stack token, membrane token); see the illustrative check below.
# If the tokens are set to None, splits are automatically generated using the ratios (70, 15, 15).
# CAUTION: This may lead to different training results and biases, as at least the test set should be fixed.
TRAIN_TOKENS = {'Tomo1': [('S1', 'M1A'), ('S1', 'M1B')]}
VAL_TOKENS = {'Tomo1': [('S2', 'M15A'), ('S2', 'M15B')]}
TEST_TOKENS = {'Tomo1': [('S4', 'M8A'), ('S4', 'M8B')]}
if TEST_ALL:
    TRAIN_TOKENS = None
    VAL_TOKENS = None
    TEST_TOKENS = None
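# Illustrative validation (an addition, not part of the pipeline): each split dict maps a tomogram token to a list of
# (stack token, membrane token) pairs, as described above.
for _split in (TRAIN_TOKENS, VAL_TOKENS, TEST_TOKENS):
    if _split is not None:
        assert all(isinstance(membrane, tuple) and len(membrane) == 2
                   for membranes in _split.values() for membrane in membranes)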
# Specify which distances should be used for training. Should be a list, where each entry is:
# - either a single protein token --> distances to only this protein class are computed
# - or a list of protein tokens --> the minimal distance to any of the protein tokens in the list is computed
# For each entry in the list, the network generates a separate output, so you can have multiple heatmaps for multiple
# particle classes.
# TRAINING_PARTICLE_DISTS = [['PSII', 'UK'], 'b6f']  # example with two outputs; overridden by the line below
TRAINING_PARTICLE_DISTS = [['PSII', 'UK']]
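# With the active setting, the network produces a single output heatmap for the combined 'PSII' / 'UK' class; the
# commented example above would yield two heatmaps (one for 'PSII' / 'UK' combined, one for 'b6f').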
LOG_CLUSTERING_STATS = True  # Flag whether or not to log clustering statistics during training.
# Currently only works for single-output heatmaps (i.e., 1 predicted protein class).
# If more than one cluster bandwidth is specified, only the first one will be used.
LOG_CLUSTERING_EVERY_NTH_EPOCH = 10  # Every nth epoch, clustering is performed to validate training (only applies if
# LOG_CLUSTERING_STATS is True)
BATCH_SIZE = 512
MAX_EPOCHS = 10
MAX_PARTICLE_DISTANCE = 7. # all distances above this value will be capped
LEARNING_RATE = 1e-5
WEIGHT_DECAY = 1e-3
LOSS_FN = 'MSE'  # Choose between 'MSE', 'Huber', or 'L1' loss
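# Illustrative guard (an addition, based on the options named above): restrict LOSS_FN to the supported losses.
assert LOSS_FN in ('MSE', 'Huber', 'L1')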
RUN_TOKEN = None
## Clustering settings
CLUSTER_BANDWIDTHS = [18, 23, 28]  # Multiple values can be provided if F1 scores for different bandwidths should be
# calculated and compared
RECLUSTER_FLAG = True  # Should the clusters be re-clustered if they are too large?
RECLUSTER_THRES = [78, 78, 78]  # If a cluster has a diameter larger than these values, it will be re-clustered with
# a smaller bandwidth. Needs to be of the same shape as CLUSTER_BANDWIDTHS
RECLUSTER_BANDWIDTHS = [13, 18, 24]  # Bandwidths for re-clustering. Needs to be of the same shape as CLUSTER_BANDWIDTHS
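# Optional consistency check (an addition based on the comments above): the re-clustering lists must match
# CLUSTER_BANDWIDTHS in length.
assert len(RECLUSTER_THRES) == len(RECLUSTER_BANDWIDTHS) == len(CLUSTER_BANDWIDTHS)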
## Evaluation settings
PROT_TOKENS_PRED = ['PSII', 'UK']  # Tokens of proteins that can be considered GT for predicted positions (if a predicted position is close to one of them, it is a hit)
PROT_TOKENS_GT = ['PSII']  # Tokens of proteins that should be hit. If a predicted position is close to one of them, this GT position is hit
## Expert settings
MAX_DIST_FROM_MEMBRANE = 15  # maximum distance of points sampled from the membrane segmentation. A higher value increases
# the robustness of the normals, but also the computational effort.
SHRINK_THRES = 118