简谱识别（开坑）

这是一个中文项目，优先用中文写博客。

将在下周完成项目的计划部分。

这也算是开一个坑，先介绍一下项目目标：

1.可以读取储存在文件中的简谱（第一期工程，done）

可以根据响指的声音，对摄像头内容进行抓拍，自动裁切并增强持谱人手里的乐谱进行储存（第三期工程）

2.对乐谱进行基础识别，这里的基础是指正确识别乐谱的数字和高低音（第一期工程）

对乐谱进行完整的识别，这里的完整指包括休止符，滑音，连音，重音的识别（第二期工程）

对乐谱进行更通用的识别，这里的通用指自动忽略汉字等无效信息，可以识别副歌和节拍部分（第三期工程）

3.对乐谱进行播放和表演。鉴于Range同学的平台具有多样性，可以同时使用Linux和Windows平台，Range将选择两种方式进行表演。

方式零：将裁切识别的结果直接展示（第一期，done），组成新的简化的乐谱（第一期，未完成）

方式一：直接在树莓派使用蜂鸣器进行表演，这也是在Linux平台几乎唯一的可选方式，因为Linux平台没有内置容易调用的MIDI相关API，本着尽量使用原生库的原则，Range决定只在Linux保留这种方案。

方式二：在Windows平台直接使用MIDI接口实现。因为照片截取和实现识别在树莓派4B实行，需要通过网络协议将识别结果传送至Windows，因此这也需要相关的网络协议，更需要Range找到合适的乐曲编码方案将数据打包并且传输。

网络协议Range目前有两种实现方式。第一种也是比较成熟的方案：在树莓派上安装MQTT的服务端和客户端，Windows使用客户端和树莓派通信。第二种比较原始，也许不应该叫协议：直接使用C语言的socket接口（Linux下的BSD socket，Windows下的Winsock，它们并不属于C标准库）进行字节的传输。

方式三：使用串口和Windows通信。Range不是电子相关专业的学生，暂不采用。

项目实现：

音符的定位和裁切：目前使用二值化加x方向和y方向投影的方式进行定位，很蠢但是很有效

特殊符号的处理：方案也是有很多。方案一是只识别数字，然后根据数字的比例，对相应数字上下的符号进行裁切，如果是比较长的符号，可以尝试一下写一个栈进行匹配

数据的处理和压缩：使用PCA，百用不厌，Linear Algebra立大功！目测降维到10维效果不错，处理也贼快

分类器和匹配：使用KNN和NCC（手写），使用模板匹配（from opencv）

以上方法都挺蠢，如果有时间会尝试CNN的入门，多读读论文没毛病，有512MB超大显存的树莓派不用白不用

性能要求：

在8G_64BIT_OS的树莓派4B上识别时间不得超过3s

我应该学到什么：

1.设计完整产品和系统，对产品的需求和应用有充分认知。

2.对信息（乐谱）的编码和表示，将是我对编码和效率的第一次探索，纵然没有相关知识和经验，至少可以开动脑筋。

3.对图像识别和ocr算法的进一步学习和总结对比。

4.了解网络的传输和对比。

5.更加熟练的远程coding，使用Linux。

6.使用Windows特有的api，感受Windows多媒体开发的魅力，体验Windows和Linux的差异，加强对二者的认知。

7.在如此多的选择中拍定最终方案，并实现它。

8.主动阅读论文，增加论文气氛和实力。

以下是第一期草稿工程的代码，感谢giegie提供的项目和投影判断的方法

import cv2
from skimage.metrics import structural_similarity as ssim
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import os
import numpy as np
import imutils

def read_img(img_path):
    """
    Load an image from disk and binarize it for projection analysis.

    args:
     img_path: path of the image file to load
    returns:
     threshold_img: single-channel mask holding 1 where the grayscale value
      is <= 120 and 0 elsewhere (THRESH_BINARY_INV with maxval 1), so that
      summing along an axis counts dark pixels
     img_shape: shape of threshold_img, i.e. (height, width)
     img: the colour image exactly as returned by cv2.imread
    raises:
     OSError: if cv2.imread could not load img_path
    """
    img = cv2.imread(img_path)
    # cv2.imread reports failure by returning None instead of raising, so
    # fail here with the offending path rather than later inside cvtColor.
    if img is None:
        raise OSError("could not read image: " + str(img_path))
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, threshold_img = cv2.threshold(gray_img, 120, 1, cv2.THRESH_BINARY_INV)
    img_shape = threshold_img.shape
    return threshold_img, img_shape, img

def project_to_y(threshold_img,img_shape):
    """
    Split the page into horizontal bands by projecting the mask onto the y axis.

    args:
     threshold_img: 2-d mask; assumed to be the 0/1 mask produced by read_img,
      so that a row sum is the number of dark pixels in that row
     img_shape: (height, width) of threshold_img; only the height is used
    returns:
     edge_inf_y: list of [top, bottom] row indices, one entry per band;
      bottom is one past the row that closed the band
     row: number of bands found

    A band opens on the first row whose sum is above 10 and closes on the
    first later row whose sum is below 10 once the band is more than 50 rows
    tall. A band that is still open when the last row is reached is not
    reported.
    """
    total_rows = img_shape[0]
    # one value per image row: sum of the mask along that row
    ink_per_row = np.sum(np.uint8(np.array(threshold_img)), axis=1)
    bands = []
    inside = False
    band_top = 0
    for y in range(total_rows):
        ink = ink_per_row[y]
        if not inside:
            if ink > 10:
                inside = True
                band_top = y
        elif ink < 10 and y - band_top > 50:
            # shorter gaps are ignored, so nearby marks stay in the same band
            inside = False
            bands.append([band_top, y + 1])
    return bands, len(bands)

def project_to_x(threshold_img,img_shape,edge_inf_y,row):
    """
    Split every horizontal band into symbol boxes by projecting onto the x axis.

    args:
     threshold_img: 2-d mask; assumed to be the 0/1 mask produced by read_img
     img_shape: (height, width) of threshold_img; only the width is used
     edge_inf_y: list of [top, bottom] row ranges, one per band
     row: number of leading entries of edge_inf_y to process
    returns:
     list of [top, bottom, left, right] boxes, band by band and left to right;
     right is one past the column that closed the box

    Inside a band a box opens on the first column whose sum lies strictly
    between 9 and 50 and closes on the first later column whose sum is below
    9 once the box is more than 3 columns wide. A box still open at the right
    edge is not reported.
    """
    page_width = img_shape[1]
    # thicken the strokes with a 3x3 rectangular kernel before projecting
    square3 = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    thick = cv2.dilate(threshold_img, square3)
    boxes = []
    for k in range(row):
        top, bottom = edge_inf_y[k][0], edge_inf_y[k][1]
        strip = thick[top:bottom, 0:page_width]
        # one value per column: sum of the dilated mask inside this band
        ink_per_col = np.sum(np.uint8(np.array(strip)), axis=0)
        inside = False
        left = 0
        for col in range(page_width):
            ink = ink_per_col[col]
            if not inside:
                # columns with a sum of 50 or more never open a box
                if 9 < ink < 50:
                    inside = True
                    left = col
            elif ink < 9 and col - left > 3:
                inside = False
                boxes.append([top, bottom, left, col + 1])
    return boxes

def expo_obj(edge_inf,img,save_path):
    """
    Write every detected box to disk as its own JPEG crop.

    args:
     edge_inf: list of [top, bottom, left, right] boxes
     img: image to crop from
     save_path: prefix for the output files; crop number i is written to
      str(save_path) + str(i) + ".jpg"
    """
    prefix = str(save_path)
    for index, box in enumerate(edge_inf):
        top, bottom, left, right = box[0], box[1], box[2], box[3]
        cv2.imwrite(prefix + str(index) + ".jpg", img[top:bottom, left:right])

def ncc(test_char_cmprs,train_char_cmprs):
    """
    Label every test vector with the index of its nearest training vector.

    Despite the name, the metric is the plain Euclidean distance between rows.

    args:
     test_char_cmprs: 2-d array, one feature vector per row
     train_char_cmprs: 2-d array with the same number of columns
    returns:
     list of int, one entry per test row: the row index in train_char_cmprs
     with the smallest distance (the lowest index wins a tie)
    raises:
     ValueError: if either input is not 2-d, or if there are test rows but
      no training rows to compare them with
    """
    # Work in float so that unsigned integer inputs cannot wrap around when
    # the rows are subtracted.
    test = np.asarray(test_char_cmprs, dtype=float)
    train = np.asarray(train_char_cmprs, dtype=float)
    row_test, _ = test.shape
    row_train, _ = train.shape
    if row_test and row_train == 0:
        # Previously this case silently produced label 0 for every test row.
        raise ValueError("ncc needs at least one training vector")
    result = []
    for vector in test:
        # distance from this test vector to every training vector at once
        dist = np.sqrt(np.sum(np.square(train - vector), axis=1))
        # argmin returns the first minimum, matching the strict '<' scan
        result.append(int(np.argmin(dist)))
    return result

def gen_pca_field(edge_inf,img,numbers,train_path,x,y,r):
    """
    Build PCA-reduced feature matrices for the detected boxes and the training images.

    args:
     edge_inf: list of [top, bottom, left, right] boxes inside img
     img: colour (BGR) page image the boxes refer to
     numbers: number of training pictures; picture i is read from
      str(train_path) + str(i) + ".jpg"
     train_path: prefix of the training picture files
     x, y: every crop is resized to width x and height y so that all
      flattened vectors have the same length x*y
     r: number of principal components to keep
    returns:
     (test_char_cmprs, train_char_cmprs): the box crops and the training
     pictures projected onto the r components fitted on the training pictures

    NOTE(review): scikit-learn's PCA is expected to reject r larger than
    min(numbers, x*y) - confirm the behaviour for pages with few symbols.
    """
    flat_len = x * y
    # binarize the whole page once: dark pixels become 255, the rest 0
    page_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, page_mask = cv2.threshold(page_gray, 120, 255, cv2.THRESH_BINARY_INV)
    test_char = np.zeros((len(edge_inf), flat_len))
    for k, box in enumerate(edge_inf):
        crop = page_mask[box[0]:box[1], box[2]:box[3]]
        test_char[k, :] = np.reshape(cv2.resize(crop, (x, y)), (1, flat_len))
    # the training pictures get the same binarize / resize / flatten treatment
    train_char = np.zeros((numbers, flat_len))
    for k in range(numbers):
        sample = cv2.imread(str(train_path) + str(k) + ".jpg")
        sample_gray = cv2.cvtColor(sample, cv2.COLOR_BGR2GRAY)
        _, sample_mask = cv2.threshold(sample_gray, 120, 255, cv2.THRESH_BINARY_INV)
        train_char[k, :] = np.reshape(cv2.resize(sample_mask, (x, y)), (1, flat_len))
    # the components are learned from the training pictures only
    reducer = PCA(n_components=r)
    reducer.fit(train_char)
    return reducer.transform(test_char), reducer.transform(train_char)


if __name__ == '__main__':
    # Hard-coded input score and crop output prefix (Windows paths).
    # Other inputs used during development:
    #   /home/range/Code/pythonProject4/jianpu3.jpg
    #   D:\pythonProject4\xunlian\xunlian4.jpg
    img_path="D:\\pythonProject4\\jianpu3.jpg"
    save_path="D:\\pythonProject4\\save\\"

    # 1. load and binarize the page
    threshold_img,img_shape,img=read_img(img_path)
    # 2. locate bands (y projection), then symbol boxes inside them (x projection)
    edge_inf_y,row=project_to_y(threshold_img,img_shape)
    edge_inf=project_to_x(threshold_img,img_shape,edge_inf_y,row)
    # 3. save every box as <save_path><index>.jpg
    expo_obj(edge_inf,img,save_path)
    # 4. reduce the crops to 10 dimensions at a common 61x165 size.
    # NOTE(review): the "training" pictures are the crops written in step 3,
    # so every box is matched against itself and its siblings.
    test_char_cmprs,train_char_cmprs=gen_pca_field(edge_inf,img,len(edge_inf),save_path,61,165,10)
    # 5. nearest-neighbour label per box, shown in one window per box
    result=ncc(test_char_cmprs,train_char_cmprs)
    for window_id, label in enumerate(result):
        temp = cv2.imread(str(save_path)+str(label)+".jpg")
        cv2.imshow(str(window_id),temp)
    # 6. outline every box on the page; rectangle_img is the same array as
    # img, so the outlines are drawn onto img itself
    rectangle_img=img
    for top, bottom, left, right in edge_inf:
        rectangle_img = cv2.rectangle(rectangle_img, (left, top), (right, bottom),(255, 0, 0), thickness=1)
    cv2.namedWindow('Rectangle Image', 0)
    cv2.resizeWindow('Rectangle Image', 600, 500)
    cv2.imshow("Rectangle Image", rectangle_img)
    # block until a key is pressed
    cv2.waitKey(0)

目前项目存在的致命问题：

1.只能识别那一张图片，参数很死。

2.除了把音符排开，没有任何的展示。

优点：很快，高效（啊对对对）

我想，纵然再没有技术含量，自己去完成一件自己选择的，足够复杂的事情，并且主动做到尽善尽美，也是对自己的磨练。也许别人一周就做出来了，我需要一个月。这也没有关系，这不是我意志完全可控的，也是人外有人，天外有天。希望心中极客之火和科技优化生活的信念永不熄灭。人生不折腾还有啥乐趣，还学啥自己的专业。High on life, bro!

临时计划和方案：

方案一：实现参数自动化，采用百分比的方式，并为判断分支留出更大的判断空间，基本做到正确裁切，可以自适应大部分简谱，其中类A4效果最好。效果如下

#Numbered musical notation recognition
import cv2
from skimage.metrics import structural_similarity as ssim
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import os
import numpy as np
import imutils

def read_img(img_path):
    """
    Load an image from disk and binarize it for projection analysis.

    args:
     img_path: path of the image file to load
    returns:
     threshold_img: single-channel mask holding 1 where the grayscale value
      is <= 120 and 0 elsewhere (THRESH_BINARY_INV with maxval 1), so that
      summing along an axis counts dark pixels
     img_shape: shape of threshold_img, i.e. (height, width)
     img: the colour image exactly as returned by cv2.imread
    raises:
     OSError: if cv2.imread could not load img_path
    """
    img = cv2.imread(img_path)
    # cv2.imread reports failure by returning None instead of raising, so
    # fail here with the offending path rather than later inside cvtColor.
    if img is None:
        raise OSError("could not read image: " + str(img_path))
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, threshold_img = cv2.threshold(gray_img, 120, 1, cv2.THRESH_BINARY_INV)
    img_shape = threshold_img.shape
    return threshold_img, img_shape, img

def project_to_y(threshold_img,img_shape,rate_bar,rate_hight):
    """
    Split the page into horizontal bands by projecting the mask onto the y axis.

    args:
     threshold_img: 2-d mask; assumed to be the 0/1 mask produced by read_img,
      so that a row sum is the number of dark pixels in that row
     img_shape: (height, width) of threshold_img
     rate_bar: fraction of the image width; a row counts as occupied when its
      sum exceeds int(rate_bar * width)
     rate_hight: fraction of the image height; a band may only close once it
      is more than int(rate_hight * height) rows tall
    returns:
     edge_inf_y: list of [top, bottom] row indices, one entry per band.
      bottom is the closing row plus 20% of the minimum band height and is
      not clamped, so it may lie past the last image row
     row: number of bands found

    A band that is still open when the last row is reached is not reported.
    """
    img_hight=img_shape[0]
    img_width=img_shape[1]
    bar_width=int(rate_bar*img_width)
    char_hight=int(rate_hight*img_hight)
    # extra rows appended below every band
    bottom_pad=int(char_hight*0.2)
    # one value per image row: sum of the mask along that row
    y_projection=np.sum(np.uint8(np.array(threshold_img)),axis=1)
    in_band=False
    edge_temp=0
    edge_inf_y=[]
    for i in range(img_hight):
        if not in_band:
            if y_projection[i]>bar_width:
                in_band=True
                edge_temp=i
        elif y_projection[i]<bar_width and (i - edge_temp) >char_hight:
            in_band=False
            edge_inf_y.append([edge_temp,i+bottom_pad])
            # The earlier version reassigned the loop variable here to jump
            # one band height ahead. Assigning to the variable of a
            # `for ... in range(...)` loop does not affect the next
            # iteration, so no rows were ever skipped; the no-op is removed
            # and scanning continues on the very next row, as before.
    return edge_inf_y,len(edge_inf_y)

def project_to_x(threshold_img,img_shape,edge_inf_y,row):
    """
    Split every horizontal band into symbol boxes by projecting onto the x axis.

    args:
     threshold_img: 2-d mask; assumed to be the 0/1 mask produced by read_img
     img_shape: (height, width) of threshold_img; only the width is used
     edge_inf_y: list of [top, bottom] row ranges, one per band
     row: number of leading entries of edge_inf_y to process
    returns:
     list of [top, bottom, left, right] boxes, band by band and left to right;
     right is one past the column that closed the box

    All thresholds scale with the band that is being cut: a box opens on a
    column whose sum lies strictly between 20% and 68% of the band height,
    and closes on the first later column whose sum is below 25% of the band
    height once the box is wider than 1% of the band width. A box still open
    at the right edge is not reported.
    """
    page_width = img_shape[1]
    # thicken the strokes with a 3x3 rectangular kernel before projecting
    square3 = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    thick = cv2.dilate(threshold_img, square3)
    boxes = []
    for k in range(row):
        top, bottom = edge_inf_y[k][0], edge_inf_y[k][1]
        strip = thick[top:bottom, 0:page_width]
        strip_h, strip_w = strip.shape
        # one value per column: sum of the dilated mask inside this band
        ink_per_col = np.sum(np.uint8(np.array(strip)), axis=0)
        # per-band limits, fixed for the whole column scan
        open_above = int(strip_h * 0.20)
        open_below = int(strip_h * 0.68)
        close_below = int(strip_h * 0.25)
        min_width = int(strip_w * 0.01)
        inside = False
        left = 0
        for col in range(page_width):
            ink = ink_per_col[col]
            if not inside:
                # columns at or above the upper limit never open a box
                if open_above < ink < open_below:
                    inside = True
                    left = col
            elif ink < close_below and col - left > min_width:
                inside = False
                boxes.append([top, bottom, left, col + 1])
    return boxes

def expo_obj(edge_inf,img,save_path):
    """
    Write every detected box to disk as its own JPEG crop.

    args:
     edge_inf: list of [top, bottom, left, right] boxes
     img: image to crop from
     save_path: prefix for the output files; crop number i is written to
      str(save_path) + str(i) + ".jpg"
    """
    prefix = str(save_path)
    for index, box in enumerate(edge_inf):
        top, bottom, left, right = box[0], box[1], box[2], box[3]
        cv2.imwrite(prefix + str(index) + ".jpg", img[top:bottom, left:right])

def ncc(test_char_cmprs,train_char_cmprs):
    """
    Label every test vector with the index of its nearest training vector.

    Despite the name, the metric is the plain Euclidean distance between rows.

    args:
     test_char_cmprs: 2-d array, one feature vector per row
     train_char_cmprs: 2-d array with the same number of columns
    returns:
     list of int, one entry per test row: the row index in train_char_cmprs
     with the smallest distance (the lowest index wins a tie)
    raises:
     ValueError: if either input is not 2-d, or if there are test rows but
      no training rows to compare them with
    """
    # Work in float so that unsigned integer inputs cannot wrap around when
    # the rows are subtracted.
    test = np.asarray(test_char_cmprs, dtype=float)
    train = np.asarray(train_char_cmprs, dtype=float)
    row_test, _ = test.shape
    row_train, _ = train.shape
    if row_test and row_train == 0:
        # Previously this case silently produced label 0 for every test row.
        raise ValueError("ncc needs at least one training vector")
    result = []
    for vector in test:
        # distance from this test vector to every training vector at once
        dist = np.sqrt(np.sum(np.square(train - vector), axis=1))
        # argmin returns the first minimum, matching the strict '<' scan
        result.append(int(np.argmin(dist)))
    return result

def gen_pca_field(edge_inf,img,numbers,train_path,x,y,r):
    """
    Build PCA-reduced feature matrices for the detected boxes and the training images.

    args:
     edge_inf: list of [top, bottom, left, right] boxes inside img
     img: colour (BGR) page image the boxes refer to
     numbers: number of training pictures; picture i is read from
      str(train_path) + str(i) + ".jpg"
     train_path: prefix of the training picture files
     x, y: every crop is resized to width x and height y so that all
      flattened vectors have the same length x*y
     r: number of principal components to keep
    returns:
     (test_char_cmprs, train_char_cmprs): the box crops and the training
     pictures projected onto the r components fitted on the training pictures

    NOTE(review): the page is binarized at 100 but the training pictures at
    120 - confirm the mismatch is intentional.
    NOTE(review): scikit-learn's PCA is expected to reject r larger than
    min(numbers, x*y) - confirm the behaviour for pages with few symbols.
    """
    flat_len = x * y
    # binarize the whole page once: dark pixels become 255, the rest 0
    page_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, page_mask = cv2.threshold(page_gray, 100, 255, cv2.THRESH_BINARY_INV)
    test_char = np.zeros((len(edge_inf), flat_len))
    for k, box in enumerate(edge_inf):
        crop = page_mask[box[0]:box[1], box[2]:box[3]]
        test_char[k, :] = np.reshape(cv2.resize(crop, (x, y)), (1, flat_len))
    # training pictures: binarize (threshold 120), resize, flatten
    train_char = np.zeros((numbers, flat_len))
    for k in range(numbers):
        sample = cv2.imread(str(train_path) + str(k) + ".jpg")
        sample_gray = cv2.cvtColor(sample, cv2.COLOR_BGR2GRAY)
        _, sample_mask = cv2.threshold(sample_gray, 120, 255, cv2.THRESH_BINARY_INV)
        train_char[k, :] = np.reshape(cv2.resize(sample_mask, (x, y)), (1, flat_len))
    # the components are learned from the training pictures only
    reducer = PCA(n_components=r)
    reducer.fit(train_char)
    return reducer.transform(test_char), reducer.transform(train_char)


if __name__ == '__main__':
    # Hard-coded input score and crop output prefix (Windows paths).
    img_path="D:\\NMN\\NMN7.jpg"
    #img_path="/home/range/Code/pythonProject4/jianpu3.jpg"
    #img_path="D:\\pythonProject4\\xunlian\\xunlian4.jpg"
    save_path="D:\\NMN\\save\\"
    # load and binarize the page
    threshold_img,img_shape,img=read_img(img_path)
    print("This is img size")
    print(img_shape)
    # locate bands (y projection), then symbol boxes inside them (x projection);
    # 0.001 and 0.018 are fractions of the image width and height
    edge_inf_y,row=project_to_y(threshold_img,img_shape,0.001,0.018)
    edge_inf=project_to_x(threshold_img,img_shape,edge_inf_y,row)
    # save every box as <save_path><index>.jpg
    expo_obj(edge_inf,img,save_path)
    # reduce the crops to 10 dimensions at a common 61x165 size.
    # NOTE(review): the "training" pictures are the crops written just above,
    # so every box is matched against itself and its siblings.
    test_char_cmprs,train_char_cmprs=gen_pca_field(edge_inf,img,len(edge_inf),save_path,61,165,10)
    # result[i] is the index of the saved crop closest to box i. The loop
    # that re-read each matched crop from disk was removed: its imshow call
    # was commented out, so the images were loaded and thrown away.
    result=ncc(test_char_cmprs,train_char_cmprs)
    # outline every box on the page; rectangle_img is the same array as img,
    # so the outlines are drawn onto img itself
    rectangle_img=img
    for top, bottom, left, right in edge_inf:
        rectangle_img = cv2.rectangle(rectangle_img, (left, top), (right, bottom),(255, 0, 0), thickness=1)
    cv2.namedWindow('Rectangle Image', 0)
    cv2.resizeWindow('Rectangle Image', 600, 500)
    cv2.imshow("Rectangle Image", rectangle_img)
    # block until a key is pressed
    cv2.waitKey(0)

可以得出结论，这种投影的方式已经到达顶峰。虽然乍一看还不错，能完成切割的任务，但是给后期的识别带来了很大困难。比如因为连音的问题，相同的数字可能有着不同的裁切范围，会干扰判断；其次，将字符连同符号整体处理，给数据库添加标签也很有难度，毕竟相同的数字和符号的组合也有很多种，绝对不是最优解。另外参数难以把握，无法为不同制谱软件给出普适的识别方案（有的音节符号长于低音，有的短）。此方案结束，开始尝试下一方案。

方案二：从数字入手，先识别数字

简谱识别（开坑）

By ZIXUAN ZHU

Leave a Reply Cancel reply

简谱识别（开坑）

By ZIXUAN ZHU

Related Post

appimage sandbox

Protected: Dev Container Example

2025 OpenCamp-Docker 01

Leave a Reply Cancel reply