index.html

<!DOCTYPE html>
<!-- lang declares the document language for screen readers, translation tools and search engines. -->
<html lang="en">
<head>
    <!-- Page metadata -->
    <meta charset="utf-8">
    <meta name="description"
        content="OpenAV-dataset">
    <meta name="keywords" content="OpenAV-dataset">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>OpenAV-dataset</title>

    <!-- Stylesheets: web fonts, Bulma with its carousel/slider add-ons, icon sets, then the site's own rules -->
    <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
    <link rel="stylesheet" href="./static/css/bulma.min.css">
    <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
    <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
    <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
    <link rel="stylesheet"
        href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
    <link rel="stylesheet" href="./static/css/index.css">

    <!-- Favicons -->
    <link rel="icon" type="image/png" href="./static/favicon/favicon-16x16.png" sizes="16x16">
    <link rel="icon" type="image/png" href="./static/favicon/favicon-32x32.png" sizes="32x32">
    <link rel="shortcut icon" href="./static/favicon/favicon.ico" type="image/x-icon">

    <link rel="apple-touch-icon" href="./static/favicon/apple-touch-icon.png">
    <link href="https://fonts.googleapis.com/css?family=Merriweather:400,900,900i" rel="stylesheet">

    <!-- Scripts: only fontawesome and comp-slider are deferred; the rest load synchronously in source order.
         The order is kept as-is because index.js presumably relies on jQuery and the Bulma add-ons
         being loaded first (index.js is not visible here; confirm before reordering). -->
    <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
    <script defer src="./static/js/fontawesome.all.min.js"></script>
    <script src="./static/js/bulma-carousel.min.js"></script>
    <script src="./static/js/bulma-slider.min.js"></script>
    <script src="./static/js/comp-slider.js" defer></script>
    <script src="./static/js/index.js"></script>
</head>
<body>
    <!-- Back-to-top control. href="#" targets the top of the document and makes the link
         keyboard-focusable; aria-label names it because its only content is an icon.
         NOTE(review): any click handler is presumably bound to #btt-button in static/js/index.js;
         confirm it still behaves as intended with the native link navigation. -->
    <a id="btt-button" href="#" aria-label="Back to top">
        <svg xmlns="http://www.w3.org/2000/svg" width="30" height="30" viewBox="0 0 24 24" aria-hidden="true" focusable="false">
            <polygon points="12 6.586 3.293 15.293 4.707 16.707 12 9.414 19.293 16.707 20.707 15.293 12 6.586"/>
        </svg>
    </a>

    <!-- Hero: paper title, anonymised author/affiliation lines, venue, and the resource buttons.
         Links that open a new tab carry rel="noopener noreferrer" so the opened page gets no
         window.opener handle back to this one. -->
    <section class="hero">
    <div class="hero-body">
        <div class="container is-max-desktop">
        <div class="columns is-centered">
            <div class="column has-text-centered">
            <h1 class="title is-1 publication-title">
                OpenAV: Dataset for Audio-Visual Voice Control of a Computer for Hand
Disabled People
            </h1>
            <div class="is-size-5 publication-authors">
                <span class="author-block">Authors are hidden for peer review<sup>1</sup></span>
            </div>

            <div class="is-size-5 publication-authors">
                <span class="author-block"><sup>1</sup> Affiliation is hidden for peer review</span>
                <br />
                <span><a href="https://specom2024.ftn.uns.ac.rs/" target="_blank" rel="noopener noreferrer">SPECOM 2024</a> (submitted)</span>
            </div>

            <div class="column has-text-centered">
                <div class="publication-links">
                <!-- PDF Link. Placeholder until the paper is available: no target="_blank",
                     so clicking it does not open a copy of this page in a new tab. -->
                <span class="link-block">
                    <a href="#"
                    class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                        <i class="fas fa-file-pdf"></i>
                    </span>
                    <span>Paper (coming soon)</span>
                    </a>
                </span>
                <!-- <span class="link-block">
                    <a href="#"
                    class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                        <i class="ai ai-arxiv"></i>
                    </span>
                    <span>arXiv</span>
                    </a>
                </span> -->
                <!-- Code Link. -->
                <span class="link-block">
                    <a href="https://github.com/SMIL-SPCRAS/OpenAV-dataset/tree/main" target="_blank" rel="noopener noreferrer"
                    class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                        <i class="fab fa-github"></i>
                    </span>
                    <span>Code (coming soon)</span>
                    </a>
                </span>
                <!-- Dataset Link. Placeholder until the data is released. -->
                <span class="link-block">
                    <a href="#"
                    class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                        <i class="far fa-images"></i>
                    </span>
                    <span>Data (coming soon)</span>
                    </a>
                </span>
                </div>
            </div>
            </div>
        </div>
        </div>
    </div>
    </section>
    <section class="section" style="padding: 0; margin:0">

        <!-- TODO list: each item is a <label> wrapping a checkbox, an icon SVG and the item text.
             The first SVG only holds <defs>; the per-item SVGs pull those shapes in through <use>.
             All SVGs here are decorative, so they are hidden from assistive technology and the
             label text alone names each checkbox. -->
        <div class="TODO-section">
            <div class="container is-max-desktop">
                <div class="columns is-centered has-text-centered">
                    <div class="column is-four-fifths">
                        <h2 class="title is-5">TODO List</h2>
                        <div class="content has-text-justified">
                            <!-- Shared shape definitions referenced by id from every todo icon below. -->
                            <svg viewBox="0 0 0 0" style="position: absolute; z-index: -1; opacity: 0;" aria-hidden="true" focusable="false">
                                <defs>
                                    <path id="todo__line" stroke="#363636" d="M21 12.3h280v0.1z" ></path>
                                    <path id="todo__box" stroke="#363636" d="M21 12.7v5c0 1.3-1 2.3-2.3 2.3H8.3C7 20 6 19 6 17.7V7.3C6 6 7 5 8.3 5h10.4C20 5 21 6 21 7.3v5.4"></path>
                                    <path id="todo__check" stroke="#2b8f30" d="M10 13l2 2 5-5"></path>
                                    <circle id="todo__circle" cx="13.5" cy="12.5" r="10"></circle>
                                </defs>
                            </svg>
                            <div class="todo-list">

                            <label class="todo">
                                <input class="todo__state" type="checkbox" />

                                <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 300 25" class="todo__icon" aria-hidden="true" focusable="false">
                                    <use xlink:href="#todo__line" class="todo__line"></use>
                                    <use xlink:href="#todo__box" class="todo__box"></use>
                                    <use xlink:href="#todo__check" class="todo__check"></use>
                                    <use xlink:href="#todo__circle" class="todo__circle"></use>
                                </svg>

                                <div class="todo__text">Data collection</div>
                            </label>

                            <label class="todo">

                                <input class="todo__state" type="checkbox" />

                                <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 300 25" class="todo__icon" aria-hidden="true" focusable="false">
                                    <use xlink:href="#todo__line" class="todo__line"></use>
                                    <use xlink:href="#todo__box" class="todo__box"></use>
                                    <use xlink:href="#todo__check" class="todo__check"></use>
                                    <use xlink:href="#todo__circle" class="todo__circle"></use>
                                </svg>

                                <div class="todo__text">Data labelling</div>

                            </label>

                            <label class="todo">
                                <input class="todo__state" type="checkbox" />

                                <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 300 25" class="todo__icon" aria-hidden="true" focusable="false">
                                    <use xlink:href="#todo__line" class="todo__line"></use>
                                    <use xlink:href="#todo__box" class="todo__box"></use>
                                    <use xlink:href="#todo__check" class="todo__check"></use>
                                    <use xlink:href="#todo__circle" class="todo__circle"></use>
                                </svg>

                                <div class="todo__text">SPECOM paper submission</div>
                            </label>

                            <label class="todo">
                                <input class="todo__state" type="checkbox" />

                                <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 300 25" class="todo__icon" aria-hidden="true" focusable="false">
                                    <use xlink:href="#todo__line" class="todo__line"></use>
                                    <use xlink:href="#todo__box" class="todo__box"></use>
                                    <use xlink:href="#todo__check" class="todo__check"></use>
                                    <use xlink:href="#todo__circle" class="todo__circle"></use>
                                </svg>

                                <div class="todo__text">GitHub page creation</div>
                            </label>

                            <label class="todo">
                                <input class="todo__state" type="checkbox" />

                                <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 300 25" class="todo__icon" aria-hidden="true" focusable="false">
                                    <use xlink:href="#todo__line" class="todo__line"></use>
                                    <use xlink:href="#todo__box" class="todo__box"></use>
                                    <use xlink:href="#todo__check" class="todo__check"></use>
                                    <use xlink:href="#todo__circle" class="todo__circle"></use>
                                </svg>

                                <div class="todo__text">arXiv paper submission (after accepting)</div>
                            </label>

                            <label class="todo">
                                <input class="todo__state" type="checkbox" />

                                <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" viewBox="0 0 300 25" class="todo__icon" aria-hidden="true" focusable="false">
                                    <use xlink:href="#todo__line" class="todo__line"></use>
                                    <use xlink:href="#todo__box" class="todo__box"></use>
                                    <use xlink:href="#todo__check" class="todo__check"></use>
                                    <use xlink:href="#todo__circle" class="todo__circle"></use>
                                </svg>

                                <div class="todo__text">Release code and models (after accepting) </div>
                            </label>

                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>


        <!-- Abstract: a level-2 heading followed by the paper abstract in one paragraph.
             The heading column is centred (has-text-centered); the paragraph wrapper is
             justified (has-text-justified). -->
        <div class="abstract-section">
            <div class="container is-max-desktop abstract-sect">
                <div class="columns is-centered has-text-centered">
                    <div class="column is-four-fifths">
                        <h2 class="title is-3">Abstract</h2>
                        <div class="content has-text-justified">
                            <p>
                            In recent years, audio-visual speech recognition (AVSR) assistance systems have gained increasing attention from
                            researchers as an important part of human-computer interaction (HCI). The objective of this paper is to further advance the
                            development of assistive technologies in the AVSR field by introducing a multi-modal OpenAV dataset, intended for state-of-the-art neural network model training. The OpenAV is designed to train AVSR models for assistance to persons without
                            hands or with disabilities of their hands or arms in HCI. The dataset could also be useful for ordinary users at hands-free
                            contactless HCI. The dataset currently includes the recordings of 15 speakers with a minimum of 10 recording sessions for
                            each. Along with this we provide a detailed description of the dataset and its collection pipeline. In addition, we evaluate
                            state-of-the-art audio-visual (AV) speech recognition approach and present a baseline recognition results. We also describe
                            the recording methodology, release the recording software to public, as well as open the access to the dataset.
                            </p>
                        </div>
                    </div>
                </div>
            </div>
        </div>

        <!-- Dataset description: heading plus a single figure. -->
        <div class="dataset-section">
            <div class="container is-max-desktop">
                <div class="dataset">
                    <div class="columns is-centered has-text-centered">
                        <div class="column is-four-fifths">
                            <h2 class="title is-3">OpenAV Dataset Description</h2>
                            <!-- NOTE(review): alt text is derived from the section heading and file name;
                                 refine it to state what the figure actually shows. -->
                            <img class="img-method" src="./static/img/dataset.png" alt="Figure describing the OpenAV dataset">
                        </div>
                    </div>
                </div>
            </div>
        </div>

        <!-- Speaker snapshots: heading plus a single figure. -->
        <div class="examples-section">
            <div class="container is-max-desktop">
                <div class="examples">
                    <div class="columns is-centered has-text-centered">
                        <div class="column is-four-fifths">
                            <h2 class="title is-3">Snapshots of the Speakers</h2>
                            <!-- NOTE(review): alt text is derived from the section heading and file name;
                                 refine it to state what the figure actually shows. -->
                            <img class="img-method" src="./static/img/examples.png" alt="Example snapshots of the speakers in the OpenAV dataset">
                        </div>
                    </div>
                </div>
            </div>
        </div>

        <!-- Recording web service: heading plus a single figure. -->
        <div class="software-section">
            <div class="container is-max-desktop">
                <div class="software">
                    <div class="columns is-centered has-text-centered">
                        <div class="column is-four-fifths">
                            <h2 class="title is-3">Web-Service for Recording</h2>
                            <!-- NOTE(review): alt text is derived from the section heading and file name;
                                 refine it to state what the figure actually shows. -->
                            <img class="img-method" src="./static/img/soft.jpg" alt="Illustration of the web service used for recording">
                        </div>
                    </div>
                </div>
            </div>
        </div>

        <!-- Model architecture: heading plus a single figure. -->
        <div class="architectures-section">
            <div class="container is-max-desktop">
                <div class="architectures">
                    <div class="columns is-centered has-text-centered">
                        <div class="column is-four-fifths">
                            <h2 class="title is-3"> Audio-Visual Neural Network Model Architecture </h2>
                            <!-- NOTE(review): alt text is derived from the section heading and file name;
                                 refine it to state what the figure actually shows. -->
                            <img class="img-method" src="./static/img/architectures.png" alt="Architecture of the audio-visual neural network model">
                        </div>
                    </div>
                </div>
            </div>
        </div>

        <!-- Conclusion: heading followed by two justified paragraphs. -->
        <div class="conclusion-section">
            <div class="container is-max-desktop">
                <div class="conclusion">
                    <div class="columns is-centered has-text-centered">
                        <div class="column is-four-fifths">
                            <h2 class="title is-3">Conclusion</h2>
                            <div class="content has-text-justified">
                                <!-- Each paragraph is its own <p>; the gap between them is left to the
                                     stylesheet instead of being forced with line-break tags. -->
                                <p>
                                In this paper, we have created a multi-speaker audio-visual dataset OpenAV, designed for state-of-the-art
                                neural network model training and intended for
                                building AVSR assistance systems for people with
                                hands disabilities. The dataset could also be useful
                                for ordinary users at hands-free contactless HCI. The
                                dataset currently includes the recordings of 15
                                speakers with a minimum of 10 recording sessions for
                                each. Along with this we have provided a detailed
                                description of the dataset and its collection pipeline.
                                </p>
                                <p>
                                In addition, we have evaluated state-of-the-art audio-visual (AV) speech recognition approach and have
                                presented a baseline recognition results. The fusion
                                of both audio and visual modalities results in an
                                accuracy of <b>91.54%</b>, achieved through a model-level
                                fusion approach. This in terms of recognition accuracy
                                is comparable to the state-of-the-art results achieved
                                for other AV corpora.
                                </p>
                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>

    </section>

    <!-- Footer: link to the group's GitHub organisation, template attribution, and licence notice.
         Links that open a new tab carry rel="noopener noreferrer". -->
    <footer class="footer">
        <div class="container">
        <div class="content has-text-centered">
            <!-- Icon-only link: aria-label supplies its accessible name. -->
            <a class="icon-link external-link" href="https://github.com/SMIL-SPCRAS" target="_blank" rel="noopener noreferrer" aria-label="SMIL-SPCRAS on GitHub">
            <i class="fab fa-github"></i>
            </a>
        </div>
        <div class="columns is-centered">
            <div class="column is-8">
            <div class="content">
                <p>
                This page was built using the <a href="https://github.com/SMIL-SPCRAS/OpenAV-dataset" target="_blank" rel="noopener noreferrer">OpenAV-dataset project page</a>, which was adopted from the <a href="https://nerfies.github.io" target="_blank" rel="noopener noreferrer">Nerfies</a> project page.
                You are free to borrow the source code of this website, we just ask that you link back to this page in the footer. <br> This website is licensed under a <a rel="license noopener noreferrer" href="https://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
                Commons Attribution-ShareAlike 4.0 International License</a>.
                </p>
            </div>
            </div>
        </div>
        </div>
    </footer>
</body>
</html>