Here is a quick'n'dirty solution:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def indexFilesFromPath(dir, idxPath):
    """Walk ``dir`` and index the tags of every audio file into a Lucene index.

    Files ending in .mp3, .mp4, .ogg or .flac (case-insensitive) are read
    with tagpy. For each one a document is added with tokenized "title",
    "artist", "album" and "genre" fields, an "all" field concatenating
    album, artist and title, and an untokenized "path" field.

    dir     -- root directory, scanned recursively with os.walk.
    idxPath -- location of the Lucene index. The writer is opened with
               create=True, so an existing index there is replaced.

    Files whose tags cannot be read or indexed are reported on stderr and
    skipped; indexing continues with the next file. A progress counter and
    the final number of indexed files are written to stdout.

    Runtime target is Python 2 (``unicode``); sys.stdout.write and
    sys.exc_info() are used instead of the print statement and
    ``except X, e`` so the syntax does not depend on the interpreter version.
    """
    initVM(CLASSPATH)
    # Compare whole extensions: a fixed 3-character suffix can never
    # match "flac".
    extensions = (".mp3", ".mp4", ".ogg", ".flac")
    # getfilesystemencoding() may return None on some Python 2 platforms.
    fsEncoding = sys.getfilesystemencoding() or "utf-8"
    writer = IndexWriter(idxPath, StandardAnalyzer(), True)
    count = 0
    try:
        for root, dirs, files in os.walk(dir):
            for name in files:
                if os.path.splitext(name)[1].lower() not in extensions:
                    continue
                filepath = os.path.join(root, name)
                try:
                    # Keep the FileRef alive while its tag object is in use.
                    f = tagpy.FileRef(filepath)
                    tag = f.tag()
                    title = unicode(tag.title)
                    artist = unicode(tag.artist)
                    album = unicode(tag.album)
                    genre = unicode(tag.genre)
                    # os.walk yields byte strings for a byte-string root;
                    # decode them with the filesystem encoding instead of
                    # the ASCII default so non-ASCII paths are not lost.
                    if isinstance(filepath, unicode):
                        path = filepath
                    else:
                        path = filepath.decode(fsEncoding)
                    doc = Document()
                    doc.add(Field("title", title, Field.Store.YES, Field.Index.TOKENIZED))
                    doc.add(Field("artist", artist, Field.Store.YES, Field.Index.TOKENIZED))
                    doc.add(Field("album", album, Field.Store.YES, Field.Index.TOKENIZED))
                    doc.add(Field("genre", genre, Field.Store.YES, Field.Index.TOKENIZED))
                    doc.add(Field("all", album + u" " + artist + u" " + title,
                                  Field.Store.YES, Field.Index.TOKENIZED))
                    doc.add(Field("path", path, Field.Store.YES, Field.Index.UN_TOKENIZED))
                    writer.addDocument(doc)
                except Exception:
                    # Best effort: one unreadable file must not abort the
                    # whole run, but it should not vanish silently either.
                    err = sys.exc_info()[1]
                    sys.stderr.write("\nskipping %r: %r\n" % (filepath, err))
                    continue
                # Count only documents that were actually added.
                count += 1
                sys.stdout.write("\r %5d files indexed" % count)
                sys.stdout.flush()
        sys.stdout.write(" Done %d\n" % count)
        writer.optimize()
    finally:
        # Always release the index write lock, even after a failure.
        writer.close()
The search function is quite simple too:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def search(searchRequest, idxPath):
    """Run a Lucene query against the index and print the matching file paths.

    searchRequest -- query string in Lucene query-parser syntax; terms
                     without an explicit field are matched against "all".
    idxPath       -- location of an existing Lucene index.

    Writes a "# Found N hits for ..." header line followed by one stored
    "path" value per hit to stdout.

    Runtime target is Python 2 (``unicode``); sys.stdout.write is used
    instead of the print statement so the syntax does not depend on the
    interpreter version.
    """
    initVM(CLASSPATH)
    fsDir = FSDirectory.getDirectory(idxPath, False)
    searcher = IndexSearcher(fsDir)
    # When stdout is redirected (e.g. "> playlist.m3u") Python 2 has no
    # stdout encoding and would fall back to ASCII, failing on the first
    # non-ASCII path; pick an explicit encoding instead.
    outEncoding = (getattr(sys.stdout, "encoding", None)
                   or sys.getfilesystemencoding()
                   or "utf-8")
    try:
        queryp = QueryParser('all', StandardAnalyzer())
        query = queryp.parse(searchRequest)
        hits = searcher.search(query)
        total = hits.length()
        sys.stdout.write("# Found %d hits for %s\n" % (total, searchRequest))
        for i in range(total):
            path = unicode(hits.doc(i).getField('path').stringValue())
            sys.stdout.write(path.encode(outEncoding, "replace") + "\n")
    finally:
        # Release the index reader opened by the searcher.
        searcher.close()
A quick demo:
Indexing music database:
time ./tsearch.py index /home/fv/music/SANE /home/fv/musicindex
 16701 files indexed Done 16702
./tsearch.py index /home/fv/music/SANE /home/fv/musicindex  21,67s user 10,18s system 3% cpu 14:14,36 total
Searching:
time ./tsearch.py search "love OR hate" /home/fv/musicindex > playlist.m3u
./tsearch.py search "love OR hate" /home/fv/musicindex  0,52s user 0,09s system 14% cpu 4,310 total
A more complex search:
time ./tsearch.py search "love OR hate OR (title: rain in blood AND artist: slayer)" /home/fv/musicindex > playlist.m3u
./tsearch.py search /home/fv/musicindex  0,48s user 0,07s system 74% cpu 0,730 total
This code sample is not perfect; consider it a proof of concept rather than a ready-to-use solution.
The full code:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2.5 | |
# vim:set fileencoding=utf-8 | |
import os | |
import sys | |
import tagpy | |
from lucene import \ | |
Document, IndexSearcher, FSDirectory, MultiFieldQueryParser, QueryParser, StandardAnalyzer, IndexWriter, \ | |
StringReader, IndexReader, MoreLikeThis, Term, TermQuery, BooleanQuery,BooleanClause , Field, initVM, CLASSPATH | |
def indexFilesFromPath(dir, idxPath):
    """Walk ``dir`` and index the tags of every audio file into a Lucene index.

    Files ending in .mp3, .mp4, .ogg or .flac (case-insensitive) are read
    with tagpy. For each one a document is added with tokenized "title",
    "artist", "album" and "genre" fields, an "all" field concatenating
    album, artist and title, and an untokenized "path" field.

    dir     -- root directory, scanned recursively with os.walk.
    idxPath -- location of the Lucene index. The writer is opened with
               create=True, so an existing index there is replaced.

    Files whose tags cannot be read or indexed are reported on stderr and
    skipped; indexing continues with the next file. A progress counter and
    the final number of indexed files are written to stdout.

    Runtime target is Python 2 (``unicode``); sys.stdout.write and
    sys.exc_info() are used instead of the print statement and
    ``except X, e`` so the syntax does not depend on the interpreter version.
    """
    initVM(CLASSPATH)
    # Compare whole extensions: a fixed 3-character suffix can never
    # match "flac".
    extensions = (".mp3", ".mp4", ".ogg", ".flac")
    # getfilesystemencoding() may return None on some Python 2 platforms.
    fsEncoding = sys.getfilesystemencoding() or "utf-8"
    writer = IndexWriter(idxPath, StandardAnalyzer(), True)
    count = 0
    try:
        for root, dirs, files in os.walk(dir):
            for name in files:
                if os.path.splitext(name)[1].lower() not in extensions:
                    continue
                filepath = os.path.join(root, name)
                try:
                    # Keep the FileRef alive while its tag object is in use.
                    f = tagpy.FileRef(filepath)
                    tag = f.tag()
                    title = unicode(tag.title)
                    artist = unicode(tag.artist)
                    album = unicode(tag.album)
                    genre = unicode(tag.genre)
                    # os.walk yields byte strings for a byte-string root;
                    # decode them with the filesystem encoding instead of
                    # the ASCII default so non-ASCII paths are not lost.
                    if isinstance(filepath, unicode):
                        path = filepath
                    else:
                        path = filepath.decode(fsEncoding)
                    doc = Document()
                    doc.add(Field("title", title, Field.Store.YES, Field.Index.TOKENIZED))
                    doc.add(Field("artist", artist, Field.Store.YES, Field.Index.TOKENIZED))
                    doc.add(Field("album", album, Field.Store.YES, Field.Index.TOKENIZED))
                    doc.add(Field("genre", genre, Field.Store.YES, Field.Index.TOKENIZED))
                    doc.add(Field("all", album + u" " + artist + u" " + title,
                                  Field.Store.YES, Field.Index.TOKENIZED))
                    doc.add(Field("path", path, Field.Store.YES, Field.Index.UN_TOKENIZED))
                    writer.addDocument(doc)
                except Exception:
                    # Best effort: one unreadable file must not abort the
                    # whole run, but it should not vanish silently either.
                    err = sys.exc_info()[1]
                    sys.stderr.write("\nskipping %r: %r\n" % (filepath, err))
                    continue
                # Count only documents that were actually added.
                count += 1
                sys.stdout.write("\r %5d files indexed" % count)
                sys.stdout.flush()
        sys.stdout.write(" Done %d\n" % count)
        writer.optimize()
    finally:
        # Always release the index write lock, even after a failure.
        writer.close()
def search(searchRequest, idxPath):
    """Run a Lucene query against the index and print the matching file paths.

    searchRequest -- query string in Lucene query-parser syntax; terms
                     without an explicit field are matched against "all".
    idxPath       -- location of an existing Lucene index.

    Writes a "# Found N hits for ..." header line followed by one stored
    "path" value per hit to stdout.

    Runtime target is Python 2 (``unicode``); sys.stdout.write is used
    instead of the print statement so the syntax does not depend on the
    interpreter version.
    """
    initVM(CLASSPATH)
    fsDir = FSDirectory.getDirectory(idxPath, False)
    searcher = IndexSearcher(fsDir)
    # When stdout is redirected (e.g. "> playlist.m3u") Python 2 has no
    # stdout encoding and would fall back to ASCII, failing on the first
    # non-ASCII path; pick an explicit encoding instead.
    outEncoding = (getattr(sys.stdout, "encoding", None)
                   or sys.getfilesystemencoding()
                   or "utf-8")
    try:
        queryp = QueryParser('all', StandardAnalyzer())
        query = queryp.parse(searchRequest)
        hits = searcher.search(query)
        total = hits.length()
        sys.stdout.write("# Found %d hits for %s\n" % (total, searchRequest))
        for i in range(total):
            path = unicode(hits.doc(i).getField('path').stringValue())
            sys.stdout.write(path.encode(outEncoding, "replace") + "\n")
    finally:
        # Release the index reader opened by the searcher.
        searcher.close()
if __name__ == "__main__":
    # Command line:
    #   tsearch.py index  <music dir> <index dir>
    #   tsearch.py search <query>     <index dir>
    # Validate up front so a missing argument or a mistyped command gives a
    # usage message instead of an IndexError or a silent no-op.
    if len(sys.argv) < 4 or sys.argv[1] not in ("index", "search"):
        sys.stderr.write(
            "usage: %s index <music dir> <index dir>\n"
            "       %s search <query> <index dir>\n" % (sys.argv[0], sys.argv[0]))
        sys.exit(2)
    if sys.argv[1] == "index":
        indexFilesFromPath(sys.argv[2], sys.argv[3])
    else:
        search(sys.argv[2], sys.argv[3])
The next step is to clean my audio files' tags by retrieving tags from the Last.fm web service and putting them in the "genre" tag, then to automatically retrieve song lyrics and index them using the same method.