- feature_extractor/preprocessor_config.json +28 -0
- model_index.json +33 -0
- safety_checker/config.json +28 -0
- safety_checker/model.fp16.safetensors +3 -0
- safety_checker/model.safetensors +3 -0
- scheduler/scheduler_config.json +19 -0
- text_encoder/config.json +24 -0
- text_encoder/model.fp16.safetensors +3 -0
- text_encoder/model.safetensors +3 -0
- tokenizer/added_tokens.json +4 -0
- tokenizer/merges.txt +0 -0
- tokenizer/special_tokens_map.json +6 -0
- tokenizer/tokenizer_config.json +32 -0
- tokenizer/vocab.json +0 -0
- unet/config.json +66 -0
- unet/diffusion_pytorch_model.fp16.safetensors +3 -0
- unet/diffusion_pytorch_model.safetensors +3 -0
- vae/config.json +31 -0
- vae/diffusion_pytorch_model.fp16.safetensors +3 -0
- vae/diffusion_pytorch_model.safetensors +3 -0
feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,28 @@ | |
1 | + { |
2 | + "crop_size": { |
3 | + "height": 224, |
4 | + "width": 224 |
5 | + }, |
6 | + "do_center_crop": true, |
7 | + "do_convert_rgb": true, |
8 | + "do_normalize": true, |
9 | + "do_rescale": true, |
10 | + "do_resize": true, |
11 | + "feature_extractor_type": "CLIPFeatureExtractor", |
12 | + "image_mean": [ |
13 | + 0.48145466, |
14 | + 0.4578275, |
15 | + 0.40821073 |
16 | + ], |
17 | + "image_processor_type": "CLIPFeatureExtractor", |
18 | + "image_std": [ |
19 | + 0.26862954, |
20 | + 0.26130258, |
21 | + 0.27577711 |
22 | + ], |
23 | + "resample": 3, |
24 | + "rescale_factor": 0.00392156862745098, |
25 | + "size": { |
26 | + "shortest_edge": 224 |
27 | + } |
28 | + } |
model_index.json ADDED
@@ -0,0 +1,33 @@ | |
1 | + { |
2 | + "_class_name": "StableDiffusionPipeline", |
3 | + "_diffusers_version": "0.22.0.dev0", |
4 | + "feature_extractor": [ |
5 | + "transformers", |
6 | + "CLIPFeatureExtractor" |
7 | + ], |
8 | + "requires_safety_checker": true, |
9 | + "safety_checker": [ |
10 | + "stable_diffusion", |
11 | + "StableDiffusionSafetyChecker" |
12 | + ], |
13 | + "scheduler": [ |
14 | + "diffusers", |
15 | + "DDIMScheduler" |
16 | + ], |
17 | + "text_encoder": [ |
18 | + "transformers", |
19 | + "CLIPTextModel" |
20 | + ], |
21 | + "tokenizer": [ |
22 | + "transformers", |
23 | + "CLIPTokenizer" |
24 | + ], |
25 | + "unet": [ |
26 | + "diffusers", |
27 | + "UNet2DConditionModel" |
28 | + ], |
29 | + "vae": [ |
30 | + "diffusers", |
31 | + "AutoencoderKL" |
32 | + ] |
33 | + } |
safety_checker/config.json ADDED
@@ -0,0 +1,28 @@ | |
1 | + { |
2 | + "_name_or_path": "CompVis/stable-diffusion-safety-checker", |
3 | + "architectures": [ |
4 | + "StableDiffusionSafetyChecker" |
5 | + ], |
6 | + "initializer_factor": 1.0, |
7 | + "logit_scale_init_value": 2.6592, |
8 | + "model_type": "clip", |
9 | + "projection_dim": 768, |
10 | + "text_config": { |
11 | + "dropout": 0.0, |
12 | + "hidden_size": 768, |
13 | + "intermediate_size": 3072, |
14 | + "model_type": "clip_text_model", |
15 | + "num_attention_heads": 12 |
16 | + }, |
17 | + "torch_dtype": "float16", |
18 | + "transformers_version": "4.34.0", |
19 | + "vision_config": { |
20 | + "dropout": 0.0, |
21 | + "hidden_size": 1024, |
22 | + "intermediate_size": 4096, |
23 | + "model_type": "clip_vision_model", |
24 | + "num_attention_heads": 16, |
25 | + "num_hidden_layers": 24, |
26 | + "patch_size": 14 |
27 | + } |
28 | + } |
safety_checker/model.fp16.safetensors ADDED
@@ -0,0 +1,3 @@ | |
1 | + version https://git-lfs.github.com/spec/v1 |
2 | + oid sha256:57ecdfa243b170f9b4cb3eefaf0f64552ef78fc0bf0eb1c5b9675308447184f6 |
3 | + size 608016280 |
safety_checker/model.safetensors ADDED
@@ -0,0 +1,3 @@ | |
1 | + version https://git-lfs.github.com/spec/v1 |
2 | + oid sha256:fb351a5ded815c3ff744968ad9c6b218d071b9d313d04f35e813b84b4c0ffde8 |
3 | + size 1215979664 |
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,19 @@ | |
1 | + { |
2 | + "_class_name": "DDIMScheduler", |
3 | + "_diffusers_version": "0.22.0.dev0", |
4 | + "beta_end": 0.012, |
5 | + "beta_schedule": "scaled_linear", |
6 | + "beta_start": 0.00085, |
7 | + "clip_sample": false, |
8 | + "clip_sample_range": 1.0, |
9 | + "dynamic_thresholding_ratio": 0.995, |
10 | + "num_train_timesteps": 1000, |
11 | + "prediction_type": "epsilon", |
12 | + "rescale_betas_zero_snr": false, |
13 | + "sample_max_value": 1.0, |
14 | + "set_alpha_to_one": false, |
15 | + "steps_offset": 1, |
16 | + "thresholding": false, |
17 | + "timestep_spacing": "leading", |
18 | + "trained_betas": null |
19 | + } |
text_encoder/config.json ADDED
@@ -0,0 +1,24 @@ | |
1 | + { |
2 | + "architectures": [ |
3 | + "CLIPTextModel" |
4 | + ], |
5 | + "attention_dropout": 0.0, |
6 | + "bos_token_id": 0, |
7 | + "dropout": 0.0, |
8 | + "eos_token_id": 2, |
9 | + "hidden_act": "quick_gelu", |
10 | + "hidden_size": 768, |
11 | + "initializer_factor": 1.0, |
12 | + "initializer_range": 0.02, |
13 | + "intermediate_size": 3072, |
14 | + "layer_norm_eps": 1e-05, |
15 | + "max_position_embeddings": 77, |
16 | + "model_type": "clip_text_model", |
17 | + "num_attention_heads": 12, |
18 | + "num_hidden_layers": 12, |
19 | + "pad_token_id": 1, |
20 | + "projection_dim": 768, |
21 | + "torch_dtype": "float16", |
22 | + "transformers_version": "4.34.0", |
23 | + "vocab_size": 49408 |
24 | + } |
text_encoder/model.fp16.safetensors ADDED
@@ -0,0 +1,3 @@ | |
1 | + version https://git-lfs.github.com/spec/v1 |
2 | + oid sha256:df541f6b0bb74513aa7e01eb9a81d95f4e8079f5f77cd52dc1be98f1f7b0e3f2 |
3 | + size 246144152 |
text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@ | |
1 | + version https://git-lfs.github.com/spec/v1 |
2 | + oid sha256:f6d39cd6a8049f384d7d440b7c00ec5529dd80d28be1fe83be06009db118da14 |
3 | + size 492265168 |
tokenizer/added_tokens.json ADDED
@@ -0,0 +1,4 @@ | |
1 | + { |
2 | + "<|endoftext|>": 49407, |
3 | + "<|startoftext|>": 49406 |
4 | + } |
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff |
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@ | |
1 | + { |
2 | + "bos_token": "<|startoftext|>", |
3 | + "eos_token": "<|endoftext|>", |
4 | + "pad_token": "<|endoftext|>", |
5 | + "unk_token": "<|endoftext|>" |
6 | + } |
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,32 @@ | |
1 | + { |
2 | + "add_prefix_space": false, |
3 | + "added_tokens_decoder": { |
4 | + "49406": { |
5 | + "content": "<|startoftext|>", |
6 | + "lstrip": false, |
7 | + "normalized": true, |
8 | + "rstrip": false, |
9 | + "single_word": false, |
10 | + "special": false |
11 | + }, |
12 | + "49407": { |
13 | + "content": "<|endoftext|>", |
14 | + "lstrip": false, |
15 | + "normalized": false, |
16 | + "rstrip": false, |
17 | + "single_word": false, |
18 | + "special": false |
19 | + } |
20 | + }, |
21 | + "additional_special_tokens": [], |
22 | + "bos_token": "<|startoftext|>", |
23 | + "clean_up_tokenization_spaces": true, |
24 | + "do_lower_case": true, |
25 | + "eos_token": "<|endoftext|>", |
26 | + "errors": "replace", |
27 | + "model_max_length": 77, |
28 | + "pad_token": "<|endoftext|>", |
29 | + "tokenizer_class": "CLIPTokenizer", |
30 | + "tokenizer_file": "/home/user/.cache/huggingface/hub/models--openai--clip-vit-large-patch14/snapshots/32bd64288804d66eefd0ccbe215aa642df71cc41/tokenizer.json", |
31 | + "unk_token": "<|endoftext|>" |
32 | + } |
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff |
unet/config.json ADDED
@@ -0,0 +1,66 @@ | |
1 | + { |
2 | + "_class_name": "UNet2DConditionModel", |
3 | + "_diffusers_version": "0.22.0.dev0", |
4 | + "act_fn": "silu", |
5 | + "addition_embed_type": null, |
6 | + "addition_embed_type_num_heads": 64, |
7 | + "addition_time_embed_dim": null, |
8 | + "attention_head_dim": 8, |
9 | + "attention_type": "default", |
10 | + "block_out_channels": [ |
11 | + 320, |
12 | + 640, |
13 | + 1280, |
14 | + 1280 |
15 | + ], |
16 | + "center_input_sample": false, |
17 | + "class_embed_type": null, |
18 | + "class_embeddings_concat": false, |
19 | + "conv_in_kernel": 3, |
20 | + "conv_out_kernel": 3, |
21 | + "cross_attention_dim": 768, |
22 | + "cross_attention_norm": null, |
23 | + "down_block_types": [ |
24 | + "CrossAttnDownBlock2D", |
25 | + "CrossAttnDownBlock2D", |
26 | + "CrossAttnDownBlock2D", |
27 | + "DownBlock2D" |
28 | + ], |
29 | + "downsample_padding": 1, |
30 | + "dropout": 0.0, |
31 | + "dual_cross_attention": false, |
32 | + "encoder_hid_dim": null, |
33 | + "encoder_hid_dim_type": null, |
34 | + "flip_sin_to_cos": true, |
35 | + "freq_shift": 0, |
36 | + "in_channels": 4, |
37 | + "layers_per_block": 2, |
38 | + "mid_block_only_cross_attention": null, |
39 | + "mid_block_scale_factor": 1, |
40 | + "mid_block_type": "UNetMidBlock2DCrossAttn", |
41 | + "norm_eps": 1e-05, |
42 | + "norm_num_groups": 32, |
43 | + "num_attention_heads": null, |
44 | + "num_class_embeds": null, |
45 | + "only_cross_attention": false, |
46 | + "out_channels": 4, |
47 | + "projection_class_embeddings_input_dim": null, |
48 | + "resnet_out_scale_factor": 1.0, |
49 | + "resnet_skip_time_act": false, |
50 | + "resnet_time_scale_shift": "default", |
51 | + "sample_size": 96, |
52 | + "time_cond_proj_dim": null, |
53 | + "time_embedding_act_fn": null, |
54 | + "time_embedding_dim": null, |
55 | + "time_embedding_type": "positional", |
56 | + "timestep_post_act": null, |
57 | + "transformer_layers_per_block": 1, |
58 | + "up_block_types": [ |
59 | + "UpBlock2D", |
60 | + "CrossAttnUpBlock2D", |
61 | + "CrossAttnUpBlock2D", |
62 | + "CrossAttnUpBlock2D" |
63 | + ], |
64 | + "upcast_attention": null, |
65 | + "use_linear_projection": false |
66 | + } |
unet/diffusion_pytorch_model.fp16.safetensors ADDED
@@ -0,0 +1,3 @@ | |
1 | + version https://git-lfs.github.com/spec/v1 |
2 | + oid sha256:ec41139ac308be8ba691d1606813c7836611f70f87fbd1e76948e7ea191f2e08 |
3 | + size 1719125304 |
unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@ | |
1 | + version https://git-lfs.github.com/spec/v1 |
2 | + oid sha256:565de429ddc20d191587e819b35bcb3912ceecc8b7aa00f9ff46c9d1cd2a5da6 |
3 | + size 3438167536 |
vae/config.json ADDED
@@ -0,0 +1,31 @@ | |
1 | + { |
2 | + "_class_name": "AutoencoderKL", |
3 | + "_diffusers_version": "0.22.0.dev0", |
4 | + "act_fn": "silu", |
5 | + "block_out_channels": [ |
6 | + 128, |
7 | + 256, |
8 | + 512, |
9 | + 512 |
10 | + ], |
11 | + "down_block_types": [ |
12 | + "DownEncoderBlock2D", |
13 | + "DownEncoderBlock2D", |
14 | + "DownEncoderBlock2D", |
15 | + "DownEncoderBlock2D" |
16 | + ], |
17 | + "force_upcast": true, |
18 | + "in_channels": 3, |
19 | + "latent_channels": 4, |
20 | + "layers_per_block": 2, |
21 | + "norm_num_groups": 32, |
22 | + "out_channels": 3, |
23 | + "sample_size": 768, |
24 | + "scaling_factor": 0.18215, |
25 | + "up_block_types": [ |
26 | + "UpDecoderBlock2D", |
27 | + "UpDecoderBlock2D", |
28 | + "UpDecoderBlock2D", |
29 | + "UpDecoderBlock2D" |
30 | + ] |
31 | + } |
vae/diffusion_pytorch_model.fp16.safetensors ADDED
@@ -0,0 +1,3 @@ | |
1 | + version https://git-lfs.github.com/spec/v1 |
2 | + oid sha256:9bcf93488ec515e4c968e12c80676ac174fa3d501563aaddb9f09991e6fd04ee |
3 | + size 167335342 |
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@ | |
1 | + version https://git-lfs.github.com/spec/v1 |
2 | + oid sha256:0d31958bd43e93fd1ef7e34a22cbb1275994aebcf6433a06fa3408201a66b494 |
3 | + size 334643268 |