<!-- index.html -->

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">

    <!-- Document title (required): shown in tabs, bookmarks and search results -->
    <title>Is One GPU Enough? Pushing Image Generation at Higher-Resolutions with Foundation Models</title>

    <!-- Open Graph metadata for link previews; og:image must be an absolute URL -->
    <meta property="og:title" content="Is One GPU Enough? Pushing Image Generation at Higher-Resolutions with Foundation Models">
    <meta property="og:description" content="Pixelsmith official website">
    <meta property="og:url" content="https://thanos-db.github.io/Pixelsmith">
    <meta property="og:image" content="https://thanos-db.github.io/Pixelsmith/images/favicon.ico">
    <!-- NOTE(review): 512x512 is declared here while the icon links below declare 64x64; confirm the real size -->
    <meta property="og:image:width" content="512">
    <meta property="og:image:height" content="512">

    <!-- Favicon: the MIME type matches the .ico file that is actually referenced -->
    <link rel="icon" type="image/x-icon" sizes="64x64" href="./images/favicon.ico">

    <!-- Apple Touch Icon for iOS devices -->
    <!-- NOTE(review): iOS expects a PNG here; consider shipping a PNG version of the icon -->
    <link rel="apple-touch-icon" sizes="64x64" href="./images/favicon.ico">


    <link rel="stylesheet" href="./css/bulma.min.css">
    <link rel="stylesheet" href="./css/fontawesome.all.min.css">
    <link rel="stylesheet" href="./css/containers.css">
    <link rel="stylesheet" href="./css/style.css">

    <!-- ES6 polyfill for legacy browsers. Served from the Cloudflare cdnjs mirror:
         the original polyfill.io domain was compromised in 2024 and must not be used. -->
    <script src="https://cdnjs.cloudflare.com/polyfill/v3/polyfill.min.js?features=es6"></script>

    <!-- MathJax configuration: must be defined before the MathJax script below loads.
         Enables $...$ and \(...\) for inline math, $$...$$ and \[...\] for display math. -->
    <script>
        window.MathJax = {
            tex: {
                inlineMath: [['$', '$'], ['\\(', '\\)']],
                displayMath: [['$$', '$$'], ['\\[', '\\]']]
            },
            svg: {
                fontCache: 'global'
            }
        };
    </script>
    <script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>

    <!-- Start fetching the large gigapixel image early -->
    <link rel="preload" href="./images/galaxy_giga.jpg" as="image">
</head>
<body>


<!-- Title block: paper title, authors, affiliations and resource links -->
<div class="hero">
    <div class="hero-body">
        <div class="container is-max-desktop">
            <div class="columns is-centered">
                <div class="column has-text-centered">
                    <h1 class="title is-1 publication-title">Is One GPU Enough? Pushing Image Generation at Higher-Resolutions with Foundation Models</h1>

                    <!-- Authors: one span per author, the class names the affiliation -->
                    <div class="is-size-5 publication-authors">
                        <span class="glasgow">
                            <a href="https://scholar.google.com/citations?user=_acS9zAAAAAJ&amp;hl=it&amp;oi=ao" target="_blank" rel="noopener">Athanasios Tragakis</a><sup>*,1</sup>,</span>
                        <span class="dotphoton">
                            <a href="https://marcoaversa.github.io" target="_blank" rel="noopener">Marco Aversa</a><sup>*,2</sup>,</span>
                        <span class="glasgow">
                            <a href="https://scholar.google.com/citations?user=GAGMBAwAAAAJ&amp;hl=it&amp;oi=ao" target="_blank" rel="noopener">Chaitanya Kaul</a><sup>1</sup>,<br>
                        </span>
                        <span class="glasgow">
                            <a href="https://scholar.google.com/citations?hl=it&amp;user=laX7LzQAAAAJ" target="_blank" rel="noopener">Roderick Murray-Smith</a><sup>1</sup>,</span>
                        <span class="glasgow">
                            <a href="https://scholar.google.com/citations?hl=it&amp;user=MsPIYAoAAAAJ" target="_blank" rel="noopener">Daniele Faccio</a><sup>1</sup></span>
                    </div>

                    <!-- Affiliations, equal-contribution note and venue -->
                    <div class="is-size-5 publication-authors">
                        <span class="glasgow" style="font-size: 80%;"><sup>1</sup>University of Glasgow</span>
                        <span class="dotphoton" style="font-size: 80%;"><sup>2</sup>Dotphoton</span>
                        <span class="eql-cntrb"><small><br><sup>*</sup>Indicates Equal Contribution</small></span>
                        <span class="neurips"><br><br><b>NeurIPS 2024</b></span>
                    </div>

                    <div class="column has-text-centered">
                        <br>
                        <div class="publication-links">

                            <!-- Paper PDF link -->
                            <span class="link-block">
                                <a href="./paper/pixelsmith.pdf" target="_blank" rel="noopener"
                                class="external-link button is-normal is-rounded is-dark">
                                <span>Paper</span>
                                </a>
                            </span>

                            <!-- Github link -->
                            <span class="link-block">
                                <a href="https://github.com/Thanos-DB/Pixelsmith" target="_blank" rel="noopener"
                                class="external-link button is-normal is-rounded is-dark">
                                <span>Code</span>
                                </a>
                            </span>

                            <!-- ArXiv abstract Link -->
                            <span class="link-block">
                                <a href="https://arxiv.org/abs/2406.07251" target="_blank" rel="noopener"
                                class="external-link button is-normal is-rounded is-dark">
                                <span>arXiv</span>
                                </a>
                            </span>
                            <br><br>
                            <p><small>Star this repository on GitHub to show your support and help others discover it.</small></p>
                            <br>

                            <!-- GitHub Star Button: buttons.js replaces the .github-button anchor below with the widget -->
                            <script async defer src="https://buttons.github.io/buttons.js"></script>
                            <a class="github-button"
                            href="https://github.com/Thanos-DB/Pixelsmith"
                            data-icon="octicon-star"
                            data-size="large"
                            data-show-count="true"
                            aria-label="Star Thanos-DB/Pixelsmith on GitHub">Star</a>
                        </div>
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>


<!-- Abstract section -->
<section class="section hero is-light">
    <div class="container is-max-desktop">
        <div class="columns is-centered has-text-centered">
            <div class="column is-four-fifths">
                <h2 class="title is-3">Abstract</h2>
                <!-- Paper abstract, rendered as one justified paragraph -->
                <div class="content has-text-justified">
                    <p>
                        In this work, we introduce Pixelsmith, a zero-shot text-to-image generative framework to sample images at higher resolutions with a single GPU with minimal computational resources.
                        We are the first to show that it is possible to scale the output of a pre-trained diffusion model by a factor of 1000, opening the road for gigapixel image generation at no additional cost. Our cascading method uses the image generated at the lowest resolution as a baseline to sample at higher resolutions. For the guidance, we introduce the Slider, a tunable mechanism that fuses the overall structure contained in the first-generated image with enhanced fine details. At each inference step, we denoise patches rather than the entire latent space, minimizing memory demands such that a single GPU can handle the process, regardless of the image's resolution. Our experimental results show that Pixelsmith not only achieves higher quality and diversity compared to existing techniques, but also reduces sampling time and artifacts.
                    </p>
                </div>
            </div>
        </div>
    </div>
</section>

<!-- "One GPU is enough!" section: thumbnail picker plus before/after comparison view -->
<div class="hero">
    <div class="hero-body">
        <div class="container is-max-desktop">
            <div class="columns is-centered">
                <div class="column has-text-centered">

                    <h2 class="title is-3">One GPU is enough!</h2>
                    <!-- Carousel Section -->
                    <p>
                        With <strong>Pixelsmith</strong>, you can effortlessly scale pre-trained generative models to generate gigapixel-scale images on a <strong>single Nvidia RTX 3090 (24GB VRAM)</strong>.
                        <br>
                        <u>Note: For the best experience, it’s recommended to view the website on a computer.</u>
                        <br>        <br>
                    </p>
                    <!-- The heading lives outside the paragraph: a <p> cannot contain heading elements -->
                    <h2 class="title is-5">Select an image:</h2>

                    <!-- Each thumbnail carries its low/high resolution sources and the two overlay
                         labels as data-* attributes; clicking one calls selectImage(this). -->
                    <!-- NOTE(review): the racoon entry is labelled "x16 - 2048x2048", but going from
                         1024x1024 to 2048x2048 is a x4 pixel increase (compare the van_gogh entry);
                         confirm which figure is intended. -->
                    <div class="carousel-container">
                        <div class="images-selection" id="image-carousel">
                            <img src="./images/girl_hd.jpg" data-low-res="./images/girl_hd.jpg" data-high-res="./images/girl_giga.jpg"
                            data-background-text="x1 - 1024x1024 (StableDiffusionXL)" data-foreground-text="x16 - 4096x4096 (Ours)" alt="Girl" onclick="selectImage(this)">

                            <img src="./images/small_world_hd.jpg" data-low-res="./images/small_world_hd.jpg" data-high-res="./images/small_world_giga.jpg"
                            data-background-text="x1 - 1024x1024 (StableDiffusionXL)" data-foreground-text="x16 - 4096x4096 (Ours)" alt="Small world" onclick="selectImage(this)">

                            <img src="./images/jewels_hd.jpg" data-low-res="./images/jewels_hd.jpg" data-high-res="./images/jewels_giga.jpg"
                            data-background-text="x1 - 1360x768 (StableDiffusionXL)" data-foreground-text="x16 - 5440x3072 (Ours)" alt="Jewels" onclick="selectImage(this)">

                            <img src="./images/lego_hd.jpg" data-low-res="./images/lego_hd.jpg" data-high-res="./images/lego_giga.jpg"
                            data-background-text="x1 - 1024x1024 (StableDiffusionXL)" data-foreground-text="x16 - 4096x4096 (Ours)" alt="Lego" onclick="selectImage(this)">

                            <img src="./images/racoon_hd.jpg" data-low-res="./images/racoon_hd.jpg" data-high-res="./images/racoon_giga.jpg"
                            data-background-text="x1 - 1024x1024 (StableDiffusionXL)" data-foreground-text="x16 - 2048x2048 (Ours)" alt="Racoon" onclick="selectImage(this)">

                            <img src="./images/fantasy_forest_hd.jpg" data-low-res="./images/fantasy_forest_hd.jpg" data-high-res="./images/fantasy_forest_giga.jpg"
                            data-background-text="x1 - 1360x768 (StableDiffusionXL)" data-foreground-text="x256 - 21760x12288 (Ours)" alt="Fantasy forest" onclick="selectImage(this)">

                            <img src="./images/heaven_hd.jpg" data-low-res="./images/heaven_hd.jpg" data-high-res="./images/heaven_giga.jpg"
                            data-background-text="x1 - 1024x1024 (StableDiffusionXL)" data-foreground-text="x64 - 8192x8192 (Ours)" alt="Heaven" onclick="selectImage(this)">

                            <img src="./images/deniro_hd.jpg" data-low-res="./images/deniro_hd.jpg" data-high-res="./images/deniro_giga.jpg"
                            data-background-text="x1 - 1024x1024 (StableDiffusionXL)" data-foreground-text="x16 - 4096x4096 (Ours)" alt="De Niro" onclick="selectImage(this)">

                            <img src="./images/mars_landscape_hd.jpg" data-low-res="./images/mars_landscape_hd.jpg" data-high-res="./images/mars_landscape_giga.jpg"
                            data-background-text="x1 - 1360x768 (StableDiffusionXL)" data-foreground-text="x64 - 10880x6144 (Ours)" alt="Mars landscape" onclick="selectImage(this)">

                            <img src="./images/van_gogh_hd.jpg" data-low-res="./images/van_gogh_hd.jpg" data-high-res="./images/van_gogh_giga.jpg"
                            data-background-text="x1 - 1024x1024 (StableDiffusionXL)" data-foreground-text="x4 - 2048x2048 (Ours)" alt="Van Gogh" onclick="selectImage(this)">

                            <img src="./images/waterfall_hd.jpg" data-low-res="./images/waterfall_hd.jpg" data-high-res="./images/waterfall_giga.jpg"
                            data-background-text="x1 - 1360x768 (StableDiffusionXL)" data-foreground-text="x16 - 5440x3072 (Ours)" alt="Waterfall" onclick="selectImage(this)">

                            <img src="./images/gladiator_hd.png" data-low-res="./images/gladiator_hd.png" data-high-res="./images/gladiator_giga.png"
                            data-background-text="x1 - 1360x768 (StableDiffusionXL)" data-foreground-text="x4 - 2720x1536 (Ours)" alt="Gladiator" onclick="selectImage(this)">

                            <img src="./images/zebra_hd.jpg" data-low-res="./images/zebra_hd.jpg" data-high-res="./images/zebra_giga.jpg"
                            data-background-text="x1 - 1024x1024 (StableDiffusionXL)" data-foreground-text="x16 - 4096x4096 (Ours)" alt="Zebra" onclick="selectImage(this)">
                        </div>
                    </div>

                    <!-- New Line / Space between Carousel and Comparison -->
                    <br><br>

                    <!-- Image Comparison Section: two stacked images, their label boxes and the divider handle.
                         The fantasy_forest pair is the initial content. -->
                    <div class="images-comparison" id="image-comparison">
                        <img id="background-image" class="background-image" src="./images/fantasy_forest_hd.jpg" alt="Background Image">
                        <div class="background-text-box">Background Image</div>

                        <img id="foreground-image" class="foreground-image" src="./images/fantasy_forest_giga.jpg" alt="Foreground Image">
                        <div class="foreground-text-box">Foreground Image</div>

                        <div class="slider" id="slider"></div>
                    </div>

                </div>
            </div>
        </div>
    </div>
</div>

<!-- Methods section: step-by-step figure carousel with MathJax captions -->
<div class="hero">
    <div class="hero-body">
        <div class="container is-max-desktop">
            <div class="columns is-centered">
                <div class="column has-text-centered">
                    <h2 class="title is-3">Methods</h2>

                    <div class="static-carousel-wrapper">
                        <div class="static-carousel-container">
                            <!-- Visible slide: previous arrow, caption, figure, next arrow (initially step 1) -->
                            <div class="arrow left" id="prev">&#10094;</div>
                            <div id="static-carousel-caption">First, given a conditional prompt $c$, we sample the latent variable $\tilde{z}_0$ and decode it into pixel space as $\tilde{x} = \mathcal{D}_\theta(\tilde{z}_0)$. 
                                <br>In this step, we generate an image with a base resolution of $1024^2$ using StableDiffusionXL.</div>
                            <img id="static-carousel-id" src="./images/steps/step1.png" alt="step1">
                            <div class="arrow right" id="next">&#10095;</div>

                            <!-- Hidden slide data: one entry per step, holding the figure path (data-src)
                                 and its caption (data-caption) -->
                            <div id="sliderData" style="display: none;">
                                <div data-src="./images/steps/step1.png" data-caption="First, given a conditional prompt $c$, we sample the latent variable $\tilde{z}_0$ and decode it into pixel space as $\tilde{x} = \mathcal{D}_\theta(\tilde{z}_0)$. In this step, we generate an image with a base resolution of $1024^2$ using StableDiffusionXL."></div>
                                <div data-src="./images/steps/step2.png" data-caption="We apply an upsampling algorithm to increase the resolution of the image. The upsampling algorithm leads to a blurred output and lack of additional content. This upsampled image $x^{guid}$ will serve as guidance for our generative process."></div>
                                <div data-src="./images/steps/step3.png" data-caption="We encode the upsampled image in the VAE's latent space $z_0^{guid}=\mathcal{E}_\theta(x^{guid})$. We can easily sample each latent variable of the diffusion process through the forward diffusion process $z^{guid}_t \sim q(z^{guid}_t|z^{guid}_0)$."></div>
                                <div data-src="./images/steps/step4.png" data-caption="The generative process starts from $z_T \sim \mathcal{N}(0,I)$, which has the same dimensions as $z^{guid}_T$. The Slider’s position, indicated by a blue line, determines whether the guidance mechanism or unguided patch denoising will be applied. The Slider allows control over whether a generated image will be slightly or significantly altered compared to the previous resolution."></div>
                                <div data-src="./images/steps/step5a.png" data-caption="At each timestep, random patches are selected for denoising. This process is repeated until the entire latent space is denoised. Since each pixel is denoised only once per timestep, we track which areas have been denoised and avoid selecting those areas again."></div>
                                <div data-src="./images/steps/step5b.png" data-caption="The guidance mechanism fuses the $\hat{z}^{guid}_t$, $\hat{z}_t$ and $\hat{z}^{guid}_{t-1}$ random patches to generate the $\hat{z}_{t-1}$ patch. First, the $\hat{z}^{guid}_t$ and $\hat{z}_t$ patches are transformed to the Fourier space using a Fast Fourier Transformation ($\mathcal{FFT}$), where their phases $\phi$ are averaged. The phase is then combined with the amplitude, which is then transformed back to the spatial domain."></div>
                                <div data-src="./images/steps/step5c.png" data-caption="One of the main reasons that patch based image generation in diffusion models suffers from artifacts at higher resolutions is that while denoising patches, each patch contains the same text prompt as the prompt condition for the overall image. To address this, we combine the sampled $\hat{z}^{iFFT}_{t}$ with the image guidance $\hat{z}^{guid}_{t}$ using a chess-like mask."></div>
                            </div>
                        </div>
                    </div>

                </div>
            </div>
        </div>
    </div>
</div>


<!-- Slider section: base image next to a higher-resolution image, with a range input below -->
<div class="hero">
    <div class="hero-body">
        <div class="container is-max-desktop">
            <div class="columns is-centered">
                <div class="column has-text-centered">
                    <h2 class="title is-3">Slider</h2>
                    <p>
                    Adjusting the Slider position changes the image’s balance between detail and structure. Moving left adds fine details but creates artifacts, while moving right improves overall structure but loses detail. Depending on the image, there is an ideal slider position which offers the best balance, preserving structure and adding extra content.
                    </p>
                    <br>
                    <div class="images-slider-demo">
                        <div class="image-slider-demo-container">
                            <h3 class="title is-5"> Base resolution (1024x1024)</h3>
                            <!-- Only width is set; the height follows the image's aspect ratio -->
                            <img src="./images/slider/fox_x1.png" alt="reference" width="512">
                        </div>
                        <div class="image-slider-demo-container">
                            <h3 class="title is-5"> Higher resolution (2048x2048)</h3>
                            <img id="image-slider-base" src="./images/slider/fox_x4_25.png" alt="x4" width="512">
                        </div>
                    </div>
                    <div id="sliderValue">Slider value: 25</div>
                    <div class="bar-container">
                        <p class="bar-label">Weak guidance</p>
                        <!-- Range 1..49, initial value 25 (matches the initial image and the text above) -->
                        <input type="range" min="1" max="49" value="25" id="slider-demo" aria-label="Guidance strength">
                        <p class="bar-label">Strong guidance</p>
                    </div>

                    <!-- Disabled prompt caption, kept for reference:
                    <br>
                    <p style="font-size: 12px;">
                        prompt: "raw image, hyperrealistic, giant dark Mononoke monster, ethereal fox, seven tails, translucent dark-red blood-filled firefox, intricately woven ribbon tail, ancestral strength and resilience, atmospheric haze, film grain, cinematic film still, shallow depth of field, highly detailed, high budget, cinemascope, moody epic atmosphere, photorealistic, candid camera, color graded"
                    </p>
                    -->
                    <!-- This closing tag used to sit inside the comment above, which left the outer
                         hero div unclosed and nested every later section inside it. -->
                </div>
            </div>
        </div>
    </div>
</div>

<!-- "Improving the base generation" section: image and resolution pickers plus a zoom lens -->
<div class="hero">
    <div class="hero-body">
        <div class="container is-max-desktop">
            <div class="columns is-centered">
                <div class="column has-text-centered">

                    <h2 class="title is-3">Improving the base generation</h2>
                    <p>
                        We present a flexible image generation process where an initial image produced by the base model can be directly enhanced to any higher resolution. 
                        This method eliminates the need for intermediate steps, allowing for seamless generation of higher-resolution images in a single iteration. 
                    </p>
                    <img src="./images/multi_step.png" alt="Multi" style="width: 70%; height: auto;">
                    <br><br>
                    <p>
                        We show a few examples of a two-step image generation process. <br>
                        Zoom-in to visualize progressively enhanced details (e.g. face, hands, hair).
                        <br><br>
                    </p>

                    <!-- Image picker: each button calls selectImage2 with an image key and itself -->
                    <div id="image-selector">
                        <h3>Select an image:</h3>
                        <br>
                        <button type="button" class="external-link button is-normal is-rounded" onclick="selectImage2('image1', this)">Girl</button>
                        <button type="button" class="external-link button is-normal is-rounded" onclick="selectImage2('image2', this)">Alien</button>
                        <button type="button" class="external-link button is-normal is-rounded" onclick="selectImage2('image3', this)">Dragon</button>
                    </div>
                    <br>

                    Choose the resolution:
                    <br><br>
                    <!-- Resolution picker: hidden in the markup (display: none); each button calls
                         changeImageResolution with the resolution key and itself -->
                    <div id="resolution-selector" style="display: none;">
                        <button type="button" class="external-link button is-normal is-rounded" onclick="changeImageResolution('1024', this)">1024×1024</button>
                        <button type="button" class="external-link button is-normal is-rounded" onclick="changeImageResolution('2048', this)">2048×2048</button>
                        <button type="button" class="external-link button is-normal is-rounded" onclick="changeImageResolution('4096', this)">4096×4096</button>
                    </div>
                    <br><br>

                    <!-- Zoom target. The src previously contained literal single quotes inside the
                         attribute value, which produced a URL that cannot resolve. -->
                    <div class="images-comparison" id="zoom-in-image">
                        <img id="main-image" src="./images/emma_1024.jpg" alt="Default Image"  style="width: 800px; height: 1360px;">
                        <div id="zoom-lens" class="zoom-lens"></div>
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>


<!-- Gigapixel section: base image with a split zoom lens -->
<div class="hero">
    <div class="hero-body">
        <div class="container is-max-desktop">
            <div class="columns is-centered">
                <div class="column has-text-centered">

                    <h2 class="title is-3">Gigapixel images on a single GPU</h2>
                    <p>
                        Move the cursor over the image to zoom-in. <br>The left side of the lens displays the 1024×1024px image, while the right side showcases the 32768×32768px gigapixel image.<br>
                        Note: we resized the gigapixel image to a lower resolution for faster visualization.
                    </p>
                    <!-- Image the lens is moved over -->
                    <div id="giga-container" class="giga-container">
                        <img id="giga-image" src="./images/galaxy_hd.jpg" alt="Gigapixel image" style="width:100%; height:100%;">
                    </div>
                    <!-- Lens: a divider plus two absolutely positioned halves, each covering 50% of the lens width -->
                    <div id="giga-lens" class="giga-zoom-lens">
                        <div id="giga-lens-divider"></div> 
                        <div id="giga-lens-left" style="position:absolute; width:50%; height:100%; left:0; top:0;"></div>
                        <div id="giga-lens-right" style="position:absolute; width:50%; height:100%; right:0; top:0;"></div>
                    </div>


                </div>
            </div>
        </div>
    </div>
</div>

<!-- Citation block. The entry sits inside <pre>, so its whitespace is rendered verbatim:
     keep the BibTeX lines flush left, independent of the HTML indentation. -->
<section class="section" id="BibTeX">
    <div class="container is-max-desktop">
      <h2 class="title">BibTeX</h2>
      <pre><code>@misc{tragakis2024gpu,
  title={Is One GPU Enough? Pushing Image Generation at Higher-Resolutions with Foundation Models},
  author={Athanasios Tragakis and Marco Aversa and Chaitanya Kaul and Roderick Murray-Smith and Daniele Faccio},
  year={2024},
  eprint={2406.07251},
  archivePrefix={arXiv},
  primaryClass={cs.CV}
}</code></pre>
    </div>
</section>

<!-- Local page scripts. They are classic (non-async, non-deferred) scripts at the end of the
     body, so they run in this order after the markup above has been parsed.
     NOTE(review): the inline onclick handlers above (selectImage, selectImage2,
     changeImageResolution) are presumably defined in these files; confirm before reordering. -->
<script src="./js/load-website.js"></script>
<script src="./js/zoom.js"></script>
<script src="./js/zoom-giga.js"></script>
<script src="./js/carousel.js"></script>
<script src="./js/static_carousel.js"></script>
<script src="./js/slider.js"></script>

</body>
</html>