// Identity map on a single dimension. Used below as the lower- and upper-bound
// map of the affine.for loops, so a bound value is passed through unchanged.
#map0 = affine_map<(d0) -> (d0)>
// Maps d0 to ceil(d0 / 256). Used below as the upper bound of the %ow_256 loop,
// whose body addresses memory at %ow_256 * 256 in vector<256xf32> chunks.
#map1 = affine_map<(d0) -> (d0 ceildiv 256)>
module  {
  func @pointwise_conv_2d_nhwc_hwcf(%arg0: memref<?x?x?x?xf32>, %arg1: memref<?x?x?x?xf32>, %arg2: memref<?x?x?x?xf32>) {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %c3 = arith.constant 3 : index
    %KH = memref.dim %arg1, %c0 : memref<?x?x?x?xf32> // FH
    %KW = memref.dim %arg1, %c1 : memref<?x?x?x?xf32> // FW
    %KC = memref.dim %arg1, %c2 : memref<?x?x?x?xf32> // FC
    %ON = memref.dim %arg2, %c1 : memref<?x?x?x?xf32> // ON
    %OH = memref.dim %arg2, %c1 : memref<?x?x?x?xf32> // OH
    %OW = memref.dim %arg2, %c0 : memref<?x?x?x?xf32> // OW

    %OF = memref.dim %arg2, %c2 : memref<?x?x?x?xf32> // OF

    affine.for %on = #map0(%c0) to #map(%ON) {           // on : 0-on(batch)
        affine.for %of = #map0(%c0) to #map0(%OF) {          // of : 0-of

            affine.for %kc = #map0(%c0) to #map0(%KC) {          // kc : 0-kc

                affine.for %oh = #map0(%c0) to #map0(%OH) {       // a3 : 0-oh
                    affine.for %fh = #map0(%c0) to #map0(%KH) {     // a4 : 0-fh
                        affine.for %fw = #map0(%c0) to #map0(%KW) {   // a5 : 0-fw
                            affine.for %ow_256 = #map0(%c0) to #map1(%OW) { // a6 : 0-up[ow/256]
                                // %4 = vector.load(filter[0, 0, kc, of]): one filter element (for each kc there is a single value at `of`)
                                %4 = affine.vector_load %arg1[%c0, %c0, %kc, %of] : memref<1x1x?x?xf32>, vector<1xf32> 	
                                // %5 = vec.bcast(%4): replicate the single filter element from 1 lane to 256 lanes
                                %5 = vector.broadcast %4 : vector<1xf32> to vector<256xf32>
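                                // %6 = vec.load256(img[on, oh+fh, fw+ow_256*256, kc])
                                // NOTE(review): the load below passes only three indices to a rank-4 memref; the kc index appears to be missing.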
                                %6 = affine.vector_load %arg0[%on, %oh + %fh, %fw + %ow_256 * 256] : memref<?x?x?x?xf32>, vector<256xf32>
                                // %7 = vec.load256(out[on, oh, ow_256*256, of])
                                %7 = affine.vector_load %arg2[%on, %oh, %ow_256 * 256, %of] : memref<?x?x?x?xf32>, vector<256xf32>
                                // %8 = vec.fma(%6, %5, %7)
                                //    = img_vec * bcast(filter_elem) + out_vec  (accumulate into the current output values)
                                %8 = vector.fma %6, %5, %7 : vector<256xf32>
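                                // store %8 back to out[on, oh, ow_256*256, of]
                                // NOTE(review): the store below is annotated memref<?x?xf32>, but %arg2 is declared as memref<?x?x?x?xf32>.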
                                affine.vector_store %8, %arg2[%on, %oh, %ow_256 * 256, %of] : memref<?x?xf32>, vector<256xf32>    

