Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

python script to extract chapters #39

Closed
ex-nerd opened this issue Nov 15, 2021 · 7 comments
Closed

python script to extract chapters #39

ex-nerd opened this issue Nov 15, 2021 · 7 comments

Comments

@ex-nerd
Copy link

ex-nerd commented Nov 15, 2021

Do with it as you wish. I still haven't updated this for python3, but it's really handy combined with https://github.com/ex-nerd/audiotools (wow, I feel old looking at the last-updated dates on those) to merge files into m4b files that can load into a dedicated audiobook app on a phone.

#!/usr/bin/env python2
#
# Use with build_m4b from https://github.com/ex-nerd/audiotools
# Due to overdrive low quality, there is no point in encoding aac files
# with better than: 64kbps stereo, HE, optimize for voice
#

import os, sys, re
import mutagen.id3 as id3
from mutagen.mp3 import MP3
from mutagen import File

from collections import OrderedDict

def timestr(secs):
    """Format a duration in seconds as ``HH:MM:SS.mmm``.

    The value is rounded to the nearest millisecond and then split with
    integer arithmetic.  This avoids slicing the decimal digits of
    ``str(secs)``, which mis-rendered short fractions (1.5 became ".005"),
    overflowed to four digits when rounding up, and raised on integers or
    floats printed in scientific notation.

    :param secs: non-negative duration in seconds (int or float).
    :returns: zero-padded timestamp string, e.g. ``'01:01:01.250'``.
    """
    total_ms = int(round(float(secs) * 1000))
    total_secs, ms = divmod(total_ms, 1000)
    hours, remainder = divmod(total_secs, 3600)
    mins, whole_secs = divmod(remainder, 60)
    return '{0:02d}:{1:02d}:{2:02d}.{3:03d}'.format(hours, mins, whole_secs, ms)

def load_mp3(total, dir, file):
    """Read the OverDrive chapter markers stored in one mp3 file.

    The markers are taken from the ``TXXX:OverDrive MediaMarkers`` ID3 frame
    and matched as ``<Name>...</Name><Time>...</Time>`` pairs.  Each marker
    time is relative to the start of this file, so ``total`` (the summed
    length of the files processed before it) is added to every timestamp.

    :param total: running offset in seconds of all previously processed files.
    :param dir: directory containing the file.
    :param file: file name of the mp3 inside ``dir``.
    :returns: ``(new_total, chapters)`` where ``new_total`` is ``total`` plus
        this file's length and ``chapters`` is a list of ``[name, seconds]``
        pairs.  ``chapters`` is empty when the file carries no marker frame;
        a tuple is always returned so callers can unpack the result and the
        offset still advances past this file.
    """
    path = os.path.join(dir, file)
    # (Cover-art extraction from the 'APIC:' frame was left disabled here.)
    audio = MP3(path)
    print(audio.info.length) #, audio.info.bitrate
    # Offset for whichever file follows this one, needed on every return path.
    next_total = total + audio.info.length
    m = id3.ID3(path)

    data = m.get('TXXX:OverDrive MediaMarkers')
    if not data:
        print("Can't find TXXX data point for {0}".format(file))
        print(m.keys())
        return (next_total, [])
    # Drop any non-ASCII characters from the marker text.
    info = data.text[0].encode("ascii", "ignore")
    file_chapters = re.findall(r"<Name>\s*([^>]+?)\s*</Name><Time>\s*([\d:.]+)\s*</Time>", info, re.MULTILINE)
    chapters = []
    for (name, length) in file_chapters:
        name = re.sub(r'^"(.+)"$', r'\1', name)          # strip surrounding double quotes
        name = re.sub(r'^\*(.+)\*$', r'\1', name)        # strip surrounding asterisks
        name = re.sub(r'\s*\([^)]*\)$', '', name)        # ignore any sub-chapter markers from Overdrive
        name = re.sub(r'\s+\(?continued\)?$', '', name)  # ignore any sub-chapter markers from Overdrive
        name = re.sub(r'\s+-\s*$', '', name)             # ignore any sub-chapter markers from Overdrive
        name = re.sub(r'^Dis[kc]\s+\d+\W*$', '', name)   # ignore any disk markers from Overdrive
        name = name.strip()
        # Time is colon-separated with seconds last: [[HH:]MM:]SS.fff
        t_parts = list(length.split(':'))
        t_parts.reverse()
        seconds = total + float(t_parts[0])
        if len(t_parts) > 1:
            seconds += (int(t_parts[1]) * 60)
        if len(t_parts) > 2:
            seconds += (int(t_parts[2]) * 60 * 60)
        chapters.append([name, seconds])
        print(name, seconds)
    return (next_total, chapters)


    # try:
    #     if file.decode("utf-8") == new.decode("utf-8"):
    #         new = None
    # except:
    #     print "  FILE:  "+os.path.join(dirname, file)
    #     raise
    # # Return
    # return (m, new, changed)

def visit(arg, dirname, names):
    """Directory callback: collect chapters from every mp3 in ``dirname``.

    The mp3 files are processed in sorted name order so that the running
    time offset accumulates in playback order.  Only the first occurrence of
    each chapter name is kept.  If any chapters were found they are written
    to ``overdrive_chapters.txt`` inside ``dirname``, one
    ``<timestamp> <name>`` line per chapter.

    The output file is addressed by its full path rather than by changing
    the process working directory, so the callback leaves the current
    directory untouched.

    :param arg: unused; present to match the ``os.path.walk`` callback shape.
    :param dirname: directory being visited.
    :param names: names of the entries inside ``dirname``.
    """
    print(dirname)
    total = 0
    all_chapters = OrderedDict()
    for fname in sorted(names):
        if not fname.endswith('.mp3'):
            continue
        result = load_mp3(total, dirname, fname)
        if result is None:
            # No usable result for this file; skip it instead of crashing
            # on the tuple unpacking below.
            continue
        (total, chapters) = result
        for (name, start) in chapters:
            if name not in all_chapters:
                all_chapters[name] = start
    if all_chapters:
        out_path = os.path.join(dirname, 'overdrive_chapters.txt')
        with open(out_path, 'w') as out:
            for name, start in all_chapters.items():
                chapstr = u'{0} {1}'.format(timestr(start), name)
                print(chapstr)
                out.write(chapstr + '\n')



# Root directory: the first command-line argument when given, otherwise
# the current directory.
path = os.path.abspath(sys.argv[1] if len(sys.argv) > 1 else '.')
print(path)

# Call visit(None, dirname, names) for each directory in the tree rooted at path.
os.path.walk(path, visit, None)
@ex-nerd
Copy link
Author

ex-nerd commented Oct 5, 2022

It's apparently been a long time since I've used this script. Anyway, here's an updated version for python3:

#!/usr/bin/env python3
#
# Recursively scans current or specified directory for all subdirectories
# containing mp3 files. If these mp3 files contain overdrive chapter markers
# (id3 tag), writes overdrive_chapters.txt to the same directory.
#
# Usage:
#
# extract_overdrive_chapters.py [optional directory path]
#
# Use with build_m4b from https://github.com/ex-nerd/audiotools
#
# Note: Due to overdrive low quality, there is no point in encoding aac files
# with better than: 64kbps stereo, HE, optimize for voice
#

import os, sys, re
import mutagen.id3 as id3
from mutagen.mp3 import MP3
from mutagen import File

from collections import OrderedDict


def timestr(secs):
    """Format a duration in seconds as ``HH:MM:SS.mmm``.

    The value is rounded to the nearest millisecond and then split with
    integer arithmetic.  This avoids slicing the decimal digits of
    ``str(secs)``, which mis-rendered short fractions (1.5 became ".005"),
    overflowed to four digits when rounding up, and raised on integers or
    floats printed in scientific notation.

    :param secs: non-negative duration in seconds (int or float).
    :returns: zero-padded timestamp string, e.g. ``"01:01:01.250"``.
    """
    total_ms = int(round(float(secs) * 1000))
    total_secs, ms = divmod(total_ms, 1000)
    hours, remainder = divmod(total_secs, 3600)
    mins, whole_secs = divmod(remainder, 60)
    return f"{hours:02d}:{mins:02d}:{whole_secs:02d}.{ms:03d}"


def load_mp3(total, dir, file):
    """Read the OverDrive chapter markers stored in one mp3 file.

    The markers are taken from the ``TXXX:OverDrive MediaMarkers`` ID3 frame
    and matched as ``<Name>...</Name><Time>...</Time>`` pairs.  Each marker
    time is relative to the start of this file, so ``total`` (the summed
    length of the files processed before it) is added to every timestamp.

    :param total: running offset in seconds of all previously processed files.
    :param dir: directory containing the file.
    :param file: file name of the mp3 inside ``dir``.
    :returns: ``(new_total, chapters)`` where ``new_total`` is ``total`` plus
        this file's length and ``chapters`` is a list of ``[name, seconds]``
        pairs.  ``chapters`` is empty when the file carries no marker frame;
        a tuple is always returned so callers can unpack the result and the
        offset still advances past this file.
    """
    path = os.path.join(dir, file)
    audio = MP3(path)
    # Offset for whichever file follows this one, needed on every return path.
    next_total = total + audio.info.length
    m = id3.ID3(path)

    data = m.get("TXXX:OverDrive MediaMarkers")
    if not data:
        print("Can't find TXXX data point for {0}".format(file))
        print(m.keys())
        return (next_total, [])
    info = data.text[0]
    file_chapters = re.findall(
        r"<Name>\s*([^>]+?)\s*</Name><Time>\s*([\d:.]+)\s*</Time>", info, re.MULTILINE
    )
    chapters = []
    for (name, length) in file_chapters:
        # Strip surrounding double quotes, then surrounding asterisks.
        name = re.sub(r'^"(.+)"$', r"\1", name)
        name = re.sub(r"^\*(.+)\*$", r"\1", name)
        # Ignore any sub-chapter markers from Overdrive.
        name = re.sub(r"\s*\([^)]*\)$", "", name)
        name = re.sub(r"\s+\(?continued\)?$", "", name)
        name = re.sub(r"\s+-\s*$", "", name)
        # Ignore any disk markers from Overdrive.
        name = re.sub(r"^Dis[kc]\s+\d+\W*$", "", name)
        name = name.strip()
        # Time is colon-separated with seconds last: [[HH:]MM:]SS.fff
        t_parts = list(length.split(":"))
        t_parts.reverse()
        seconds = total + float(t_parts[0])
        if len(t_parts) > 1:
            seconds += int(t_parts[1]) * 60
        if len(t_parts) > 2:
            seconds += int(t_parts[2]) * 60 * 60
        chapters.append([name, seconds])
    return (next_total, chapters)


def visit(dirname, filenames):
    """Collect chapters from every mp3 in ``dirname`` and write them out.

    The mp3 files are processed in sorted name order so that the running
    time offset accumulates in playback order.  Only the first occurrence of
    each chapter name is kept.  If any chapters were found they are written
    to ``overdrive_chapters.txt`` inside ``dirname``, one
    ``<timestamp> <name>`` line per chapter.

    The output file is addressed by its full path rather than by changing
    the process working directory, so the function leaves the current
    directory untouched.  It is written as UTF-8 so non-ASCII chapter names
    do not depend on the platform's default encoding.

    :param dirname: directory being visited.
    :param filenames: names of the files inside ``dirname``.
    """
    print(dirname)
    total = 0
    all_chapters = OrderedDict()
    for fname in sorted(filenames):
        if not fname.endswith(".mp3"):
            continue
        result = load_mp3(total, dirname, fname)
        if result is None:
            # No usable result for this file; skip it instead of crashing
            # on the tuple unpacking below.
            continue
        (total, chapters) = result
        for (name, start) in chapters:
            if name not in all_chapters:
                all_chapters[name] = start
    if all_chapters:
        out_path = os.path.join(dirname, "overdrive_chapters.txt")
        with open(out_path, "w", encoding="utf-8") as out:
            for name, start in all_chapters.items():
                chapstr = f"{timestr(start)} {name}"
                print(chapstr)
                out.write(chapstr + "\n")


if __name__ == "__main__":
    # Root directory: the first command-line argument when given, otherwise
    # the current directory.
    path = os.path.abspath(sys.argv[1] if len(sys.argv) > 1 else ".")

    # Directories that are never descended into.
    skipped = {".git", ".direnv"}

    for dirname, dirs, files in os.walk(path, topdown=True):
        # Prune in place so os.walk does not recurse into skipped directories.
        dirs[:] = [name for name in dirs if name not in skipped]
        visit(dirname, files)

I guess I should update my audiotools scripts for python3, too.

@choc96208
Copy link

choc96208 commented Oct 10, 2022

Hi @ex-nerd, I added this script to audiobook_chapters. Hope you don't mind. I credited you in the references. I then use another script to create a FFMETADATAFILE.

@ex-nerd
Copy link
Author

ex-nerd commented Oct 10, 2022

@choc96208 Sure thing. It's not like me to leave a license off of my code. Consider it MIT (I'll update the comments and https://github.com/ex-nerd/audiotools accordingly)

@chbrown
Copy link
Owner

chbrown commented Dec 12, 2022

Cool, but out of scope, sorry.

@LeLawnGames
Copy link

@ex-nerd slightly dumb question here but I'm still fairly new to all this -- how would I specify a particular directory to run this in?

@ex-nerd
Copy link
Author

ex-nerd commented Jan 25, 2023

@LeLawnGames just pass it as the first parameter, e.g. `./extract_overdrive_chapters.py /path/to/directory`. Or leave off the path, and by default it will run in the current directory.

@LeLawnGames
Copy link

oh gotcha -- thank you!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

4 participants