diff '--color=auto' -Naur a/xine/adjustable_scr.c b/xine/adjustable_scr.c
--- a/xine/adjustable_scr.c	2023-11-05 16:04:06.000000000 +0100
+++ b/xine/adjustable_scr.c	2024-07-14 12:45:40.101222775 +0200
@@ -273,13 +273,13 @@
       set_pivot( this );
       this->buffering = 1;
       this->buffering_start_time = time_ms();
-      LOGMSG("start buffering at %"PRId64, this->cur_pts);
+      LOGMSG("start buffering at %" PRId64, this->cur_pts);
     }
   } else {
     if (this->buffering) {
       set_pivot( this );
       this->buffering = 0;
-      LOGMSG("stop buffering at %"PRId64" (buffering took %"PRIu64" ms)",
+      LOGMSG("stop buffering at %" PRId64" (buffering took %" PRIu64" ms)",
 	     this->cur_pts, elapsed(this->buffering_start_time));
     }
   }
diff '--color=auto' -Naur a/xine/demux_xvdr.c b/xine/demux_xvdr.c
--- a/xine/demux_xvdr.c	2023-11-05 16:04:06.000000000 +0100
+++ b/xine/demux_xvdr.c	2024-07-14 12:53:38.491572095 +0200
@@ -221,7 +221,7 @@
     int still_mode  = (int)this->stream->metronom->get_option(this->stream->metronom, XVDR_METRONOM_STILL_MODE);
     int trick_speed = (int)this->stream->metronom->get_option(this->stream->metronom, XVDR_METRONOM_TRICK_SPEED);
     if (still_mode > 0 || trick_speed > 0) {
-      LOGMSG("Skipping new pts %"PRId64" (still=%d trickspeed=%d)", buf->pts, still_mode, trick_speed);
+      LOGMSG("Skipping new pts %" PRId64" (still=%d trickspeed=%d)", buf->pts, still_mode, trick_speed);
       return;
     }
   }
@@ -233,7 +233,7 @@
 
     if (this->send_newpts || (this->last_pts[video] && abs(diff)>WRAP_THRESHOLD)) {
 
-      LOGVERBOSE("New PTS: %"PRId64" (%s)", buf->pts, video ? "VIDEO" : "AUDIO");
+      LOGVERBOSE("New PTS: %" PRId64" (%s)", buf->pts, video ? "VIDEO" : "AUDIO");
       if (this->buf_flag_seek) {
         _x_demux_control_newpts(this->stream, buf->pts, BUF_FLAG_SEEK);
         this->buf_flag_seek = 0;
diff '--color=auto' -Naur a/xine/vo_lastpts.c b/xine/vo_lastpts.c
--- a/xine/vo_lastpts.c	2023-11-05 16:04:06.000000000 +0100
+++ b/xine/vo_lastpts.c	2024-07-14 12:45:40.101222775 +0200
@@ -83,7 +83,7 @@
     if (this->xvdr_metronom) {
       ASSERT_RET(this->xvdr_metronom->set_option, return);
 
-      LOGVERBOSE("last pts %"PRId64, vo_img->pts);
+      LOGVERBOSE("last pts %" PRId64, vo_img->pts);
 
       this->xvdr_metronom->set_option(this->xvdr_metronom, XVDR_METRONOM_LAST_VO_PTS, vo_img->pts);
     }
diff '--color=auto' -Naur a/xine/xvdr_metronom.c b/xine/xvdr_metronom.c
--- a/xine/xvdr_metronom.c	2023-11-05 16:04:06.000000000 +0100
+++ b/xine/xvdr_metronom.c	2024-07-14 12:45:40.101222775 +0200
@@ -63,11 +63,11 @@
     int64_t dv = this->vid_pts - this->disc_pts;
     int64_t d_min = min64(da, dv);
     LOGMSG("  stream A-V diff %d ms", (int)(this->vid_pts - this->aud_pts)/90);
-    LOGMSG("  reported stream start at pts %"PRId64, this->disc_pts);
-    LOGMSG("  output fifo end at: audio %"PRId64" video %"PRId64, this->aud_pts, this->vid_pts);
-    LOGMSG("  dA %"PRId64" dV %"PRId64, da, dv);
+    LOGMSG("  reported stream start at pts %" PRId64, this->disc_pts);
+    LOGMSG("  output fifo end at: audio %" PRId64" video %" PRId64, this->aud_pts, this->vid_pts);
+    LOGMSG("  dA %" PRId64" dV %" PRId64, da, dv);
     if (d_min < 0 && d_min > -10*90000) {
-      LOGMSG("  *** output is late %"PRId64" ticks (%"PRId64" ms) ***", d_min, -d_min/90);
+      LOGMSG("  *** output is late %" PRId64" ticks (%" PRId64" ms) ***", d_min, -d_min/90);
       this->scr->jump(this->scr, d_min);
     }
     this->buffering = 0;
diff '--color=auto' -Naur a/xine_input_vdr.c b/xine_input_vdr.c
--- a/xine_input_vdr.c	2023-11-05 16:04:06.000000000 +0100
+++ b/xine_input_vdr.c	2024-07-14 12:48:56.149379516 +0200
@@ -2125,7 +2125,7 @@
       LOGMSG("vdr_flush_engine: guard > curpos, flush skipped");
       return;
     }
-    LOGMSG("vdr_flush_engine: %"PRIu64" < current position %"PRIu64", flush skipped",
+    LOGMSG("vdr_flush_engine: %" PRIu64" < current position %" PRIu64", flush skipped",
            discard_index, this->curpos);
     return;
   }
@@ -4040,7 +4040,7 @@
   mutex_lock_cancellable(&this->lock);
 
   if (this->discard_index < this->discard_index_ds)
-    LOGVERBOSE("wait_stream_sync: waiting for engine_flushed condition %"PRIu64"<%"PRIu64,
+    LOGVERBOSE("wait_stream_sync: waiting for engine_flushed condition %" PRIu64"<%" PRIu64,
                this->discard_index, this->discard_index_ds);
 
   counter = 100;
@@ -4055,7 +4055,7 @@
 
   if (this->discard_index < this->curpos) {
     /* may be less if server-side fifo was cleared */
-    LOGMSG("wait_stream_sync: discard_index %"PRIu64" != curpos %"PRIu64" ! (diff %"PRId64")",
+    LOGMSG("wait_stream_sync: discard_index %" PRIu64" != curpos %" PRIu64" ! (diff %" PRId64")",
            this->discard_index, this->curpos, (int64_t)(this->discard_index - this->curpos));
   }
 
@@ -4065,7 +4065,7 @@
   mutex_unlock_cancellable(&this->lock);
 
   if (synced) {
-    LOGVERBOSE("wait_stream_sync: streams synced at %"PRIu64"/%"PRIu64,
+    LOGVERBOSE("wait_stream_sync: streams synced at %" PRIu64"/%" PRIu64,
                this->discard_index_ds, sync_index);
     return 0;
   }
@@ -4078,7 +4078,7 @@
     errno = EINTR;
   }
   else if (counter <= 0) {
-    LOGMSG("wait_stream_sync: Timed out ! diff %"PRId64,
+    LOGMSG("wait_stream_sync: Timed out ! diff %" PRId64,
            (int64_t)(sync_index - this->discard_index_ds));
     errno = EAGAIN;
   }
diff '--color=auto' -Naur a/xine_post_swscale.c b/xine_post_swscale.c
--- a/xine_post_swscale.c	2023-11-05 16:04:06.000000000 +0100
+++ b/xine_post_swscale.c	2024-07-14 12:45:40.109222750 +0200
@@ -561,26 +561,26 @@
             /* store ebx (PIC) */
             "mov %%"REGB", "_oldbx"          \n\t"
 #endif
-	    "movl   "_src_row_size", %%"REGC"  \n\t"
-	    "shrl   $3,          %%"REGC"      \n\t" /* 8 bytes a time             */
-	    "mov    "_srcp1",    %%"REGSI"     \n\t" /* top of 2 src lines to get  */
-	    "mov    "_srcp2",    %%"REGD"      \n\t" /* next "                     */
-	    "mov    "_vWorkYW",  %%"REGDI"     \n\t" /* luma work destination line */
-	    "mov    "_vWorkUVW", %%"REGB"      \n\t" /* luma work destination line */
-	    "xor    %%"REGA",    %%"REGA"      \n\t"
+	    "movl   "_src_row_size", %%" REGC"  \n\t"
+	    "shrl   $3,          %%" REGC"      \n\t" /* 8 bytes a time             */
+	    "mov    "_srcp1",    %%" REGSI"     \n\t" /* top of 2 src lines to get  */
+	    "mov    "_srcp2",    %%" REGD"      \n\t" /* next "                     */
+	    "mov    "_vWorkYW",  %%" REGDI"     \n\t" /* luma work destination line */
+	    "mov    "_vWorkUVW", %%" REGB"      \n\t" /* luma work destination line */
+	    "xor    %%" REGA",    %%" REGA"      \n\t"
 #if !defined(__x86_64__)
 	    /* Let's check here to see if we are on a P4 or higher and can use SSE2 instructions.
 	     * This first loop is not the performance bottleneck anyway but it is trivial to tune
 	     * using SSE2 if we have proper alignment.
 	     */
 	    "testl  $1, "_SSE2enabledW"  \n\t"  /* is SSE2 supported?*/
-	    "jz     "vMaybeSSEMMX"f      \n\t"  /* n, can't do anyway*/
+	    "jz     " vMaybeSSEMMX"f      \n\t"  /* n, can't do anyway*/
 #endif
-	    "cmpl   $2, %%"REGC"         \n\t"  /* we have at least 16 bytes, 2 qwords? */
-	    "jl     "vMaybeSSEMMX"f      \n\t"  /* n, don't bother*/
+	    "cmpl   $2, %%" REGC"         \n\t"  /* we have at least 16 bytes, 2 qwords? */
+	    "jl     " vMaybeSSEMMX"f      \n\t"  /* n, don't bother*/
 	    
-	    "shrl   $1, %%"REGC"         \n\t"  /* do 16 bytes at a time instead*/
-	    "decl   %%"REGC"             \n"    /* jigger loop ct */
+	    "shrl   $1, %%" REGC"         \n\t"  /* do 16 bytes at a time instead*/
+	    "decl   %%" REGC"             \n"    /* jigger loop ct */
 	    
 	    ".align 16                   \n\t"
 
@@ -589,14 +589,14 @@
 	    "movdqu "_vWeight2", %%xmm6  \n\t"
 	    "movdqu "_YMask",    %%xmm7  \n"
 
-	    ""vLoopSSE2_Fetch":          \n\t"
+	    "" vLoopSSE2_Fetch":          \n\t"
 #ifdef PREFETCH
-	    "  prefetcht0 16(%%"REGSI", %%"REGA", 2) \n\t"
-	    "  prefetcht0 16(%%"REGD",  %%"REGA", 2) \n"
+	    "  prefetcht0 16(%%" REGSI", %%" REGA", 2) \n\t"
+	    "  prefetcht0 16(%%" REGD",  %%" REGA", 2) \n"
 #endif	    
-	    ""vLoopSSE2":  \n\t"
-	    "  movdqu   (%%"REGSI", %%"REGA", 2), %%xmm1 \n\t" /* top of 2 lines to interpolate */
-	    "  movdqu   (%%"REGD",  %%"REGA", 2), %%xmm2 \n\t" /* 2nd of 2 lines */
+	    "" vLoopSSE2":  \n\t"
+	    "  movdqu   (%%" REGSI", %%" REGA", 2), %%xmm1 \n\t" /* top of 2 lines to interpolate */
+	    "  movdqu   (%%" REGD",  %%" REGA", 2), %%xmm2 \n\t" /* 2nd of 2 lines */
 
 	    "  movdqa   %%xmm1, %%xmm3  \n\t"  /* get chroma  bytes  */
 	    "  pand     %%xmm7, %%xmm1  \n\t"  /* keep only luma     */
@@ -614,9 +614,9 @@
 	    "  paddusw  %%xmm0, %%xmm1  \n\t"  /* round             */
 	    "  psrlw        $8, %%xmm1  \n\t"  /* right adjust luma */
 #ifdef STREAMING_STORE_TMP
-	    "  movntdq  %%xmm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
+	    "  movntdq  %%xmm1, (%%" REGDI", %%" REGA", 2) \n\t" /* save lumas in our work area */
 #else
-	    "  movdqu   %%xmm1, (%%"REGDI", %%"REGA", 2) \n\t" /* save lumas in our work area */
+	    "  movdqu   %%xmm1, (%%" REGDI", %%" REGA", 2) \n\t" /* save lumas in our work area */
 #endif
 	    "  paddw    %%xmm4, %%xmm3  \n\t"  /* combine chromas */
 	    "  paddusw  %%xmm0, %%xmm3  \n\t"  /* round */
@@ -624,36 +624,36 @@
 	    "  packuswb %%xmm3, %%xmm3  \n\t"  /* pack UV's into low dword */
 	    "  movdq2q  %%xmm3, %%mm1   \n\t"  /* save in our work area    */
 #ifdef STREAMING_STORE_TMP
-	    "  movntq    %%mm1, (%%"REGB", %%"REGA") \n\t"  /* save in our work area */
+	    "  movntq    %%mm1, (%%" REGB", %%" REGA") \n\t"  /* save in our work area */
 #else
-	    "  movq      %%mm1, (%%"REGB", %%"REGA") \n\t"  /* save in our work area */
+	    "  movq      %%mm1, (%%" REGB", %%" REGA") \n\t"  /* save in our work area */
 #endif
-	    "  lea   8(%%"REGA"), %%"REGA"  \n\t"
-	    "  decl  %%"REGC"               \n\t"
+	    "  lea   8(%%" REGA"), %%" REGA"  \n\t"
+	    "  decl  %%" REGC"               \n\t"
 	    
-	    "  jg    "vLoopSSE2_Fetch"b     \n\t"  /* if not on last one loop, prefetch */
-	    "  jz    "vLoopSSE2"b           \n\t"  /* or just loop, or not */
+	    "  jg    " vLoopSSE2_Fetch"b     \n\t"  /* if not on last one loop, prefetch */
+	    "  jz    " vLoopSSE2"b           \n\t"  /* or just loop, or not */
 
 	    /* done with our SSE2 fortified loop but we may need to pick up the spare change */
 #ifdef STREAMING_STORE_TMP
 	    "  sfence    \n\t"
 #endif
-	    "  movl  "_src_row_size", %%"REGC" \n\t"  /* get count again   */
-	    "  andl  $15, %%"REGC"          \n\t"  /* just need mod 16  */
+	    "  movl  "_src_row_size", %%" REGC" \n\t"  /* get count again   */
+	    "  andl  $15, %%" REGC"          \n\t"  /* just need mod 16  */
 
 	    "  movq  "_YMask",    %%mm7     \n\t"  /* useful luma mask constant - lazy dupl init */
 	    "  movq  "_vWeight1", %%mm5     \n\t"
 	    "  movq  "_vWeight2", %%mm6     \n\t"
 	    "  movq  "_FPround1", %%mm0     \n\t"  /* useful rounding constant  */
 
-	    "  shrl  $3, %%"REGC"     \n\t"  /* 8 bytes at a time, any?  */
-	    "  jz   "MoreSpareChange"f \n"    /* n, did them all  */
+	    "  shrl  $3, %%" REGC"     \n\t"  /* 8 bytes at a time, any?  */
+	    "  jz   " MoreSpareChange"f \n"    /* n, did them all  */
 
 	    /* Let's check here to see if we are on a P2 or Athlon and can use SSEMMX instructions.
 	     * This first loop is not the performance bottleneck anyway but it is trivial to tune
 	     * using SSE if we have proper alignment.
 	     */
-	    ""vMaybeSSEMMX":    \n\t"
+	    "" vMaybeSSEMMX":    \n\t"
 
 	    "  movq  "_YMask",    %%mm7    \n\t"  /* useful luma mask constant - lazy dupl init */
 	    "  movq  "_vWeight1", %%mm5    \n\t"  
@@ -661,19 +661,19 @@
 	    "  movq  "_FPround1", %%mm0    \n\t"  /* useful rounding constant  */
 #if !defined(__x86_64__)
 	    "  testl $1, "_SSEMMXenabledW" \n\t"  /* MMXEXTsupported? */
-	    "  jz    "vLoopMMX"f           \n\t"  /* n, can't do anyway */
+	    "  jz    " vLoopMMX"f           \n\t"  /* n, can't do anyway */
 #endif
-	    "  decl  %%"REGC"              \n"    /* jigger loop ctr */
+	    "  decl  %%" REGC"              \n"    /* jigger loop ctr */
 
 	    ".align 16             \n"
-	    ""vLoopSSEMMX_Fetch":  \n\t"
+	    "" vLoopSSEMMX_Fetch":  \n\t"
 #ifdef PREFETCH
-	    "  prefetcht0 8(%%"REGSI", %%"REGA", 2)  \n\t"
-	    "  prefetcht0 8(%%"REGD",  %%"REGA", 2)  \n"
+	    "  prefetcht0 8(%%" REGSI", %%" REGA", 2)  \n\t"
+	    "  prefetcht0 8(%%" REGD",  %%" REGA", 2)  \n"
 #endif
-	    ""vLoopSSEMMX":   \n\t"
-	    "  movq    (%%"REGSI", %%"REGA", 2), %%mm1  \n\t"   /* top of 2 lines to interpolate */
-	    "  movq    (%%"REGD",  %%"REGA", 2), %%mm2  \n\t"   /* 2nd of 2 lines    */
+	    "" vLoopSSEMMX":   \n\t"
+	    "  movq    (%%" REGSI", %%" REGA", 2), %%mm1  \n\t"   /* top of 2 lines to interpolate */
+	    "  movq    (%%" REGD",  %%" REGA", 2), %%mm2  \n\t"   /* 2nd of 2 lines    */
 
 	    "  movq    %%mm1, %%mm3  \n\t"   /* copy top bytes */
 	    "  pand    %%mm7, %%mm1  \n\t"   /* keep only luma */
@@ -693,30 +693,30 @@
 	    "  paddusw %%mm0, %%mm1  \n\t"   /* round             */
 	    "  psrlw      $8, %%mm1  \n\t"   /* right adjust luma */
 #ifdef STREAMING_STORE_TMP
-	    "  movntq  %%mm1, (%%"REGDI", %%"REGA", 2) \n\t"  /* save lumas in our work area */
+	    "  movntq  %%mm1, (%%" REGDI", %%" REGA", 2) \n\t"  /* save lumas in our work area */
 #else
-	    "  movq    %%mm1, (%%"REGDI", %%"REGA", 2) \n\t"  /* save lumas in our work area */
+	    "  movq    %%mm1, (%%" REGDI", %%" REGA", 2) \n\t"  /* save lumas in our work area */
 #endif	    
 	    "  paddw    %%mm4, %%mm3  \n\t"  /* combine chromas  */
 	    "  paddusw  %%mm0, %%mm3  \n\t"  /* round            */
 	    "  psrlw       $8, %%mm3  \n\t"  /* right adjust chroma  */
 	    "  packuswb %%mm3, %%mm3  \n\t"  /* pack UV's into low dword */
-	    "  movd     %%mm3, (%%"REGB", %%"REGA") \n\t"  /* save in our work area    */
+	    "  movd     %%mm3, (%%" REGB", %%" REGA") \n\t"  /* save in our work area    */
 	    
-	    "  lea   4(%%"REGA"), %%"REGA" \n\t"
-	    "  decl  %%"REGC"              \n\t"
-	    "  jg    "vLoopSSEMMX_Fetch"b  \n\t"  /* if not on last one loop, prefetch  */
-	    "  jz    "vLoopSSEMMX"b        \n\t"  /* or just loop, or not  */
+	    "  lea   4(%%" REGA"), %%" REGA" \n\t"
+	    "  decl  %%" REGC"              \n\t"
+	    "  jg    " vLoopSSEMMX_Fetch"b  \n\t"  /* if not on last one loop, prefetch  */
+	    "  jz    " vLoopSSEMMX"b        \n\t"  /* or just loop, or not  */
 #ifdef STREAMING_STORE_TMP
 	    "  sfence                      \n\t"
 #endif
-	    "  jmp    "MoreSpareChange"f   \n"    /* all done with vertical  */
+	    "  jmp    " MoreSpareChange"f   \n"    /* all done with vertical  */
 	    
 	    ".align 16     \n"
-	    ""vLoopMMX":   \n\t"
+	    "" vLoopMMX":   \n\t"
 
-	    "  movq (%%"REGSI", %%"REGA", 2), %%mm1  \n\t" /* top of 2 lines to interpolate */
-	    "  movq (%%"REGD",  %%"REGA", 2), %%mm2  \n\t" /* 2nd of 2 lines */
+	    "  movq (%%" REGSI", %%" REGA", 2), %%mm1  \n\t" /* top of 2 lines to interpolate */
+	    "  movq (%%" REGD",  %%" REGA", 2), %%mm2  \n\t" /* 2nd of 2 lines */
 
 	    "  movq     %%mm1, %%mm3  \n\t"  /* copy top bytes    */
 	    "  pand     %%mm7, %%mm1  \n\t"  /* keep only luma    */
@@ -735,79 +735,79 @@
 	    "  paddw    %%mm2, %%mm1  \n\t"  /* combine lumas     */
 	    "  paddusw  %%mm0, %%mm1  \n\t"  /* round             */
 	    "  psrlw       $8, %%mm1  \n\t"  /* right adjust luma */
-	    "  movq     %%mm1, (%%"REGDI", %%"REGA", 2) \n\t"  /* save lumas in our work area */
+	    "  movq     %%mm1, (%%" REGDI", %%" REGA", 2) \n\t"  /* save lumas in our work area */
 	    
 	    "  paddw    %%mm4, %%mm3  \n\t"  /* combine chromas     */
 	    "  paddusw  %%mm0, %%mm3  \n\t"  /* round               */
 	    "  psrlw       $8, %%mm3  \n\t"  /* right adjust chroma */
 	    "  packuswb %%mm3, %%mm3  \n\t"  /* pack UV's into low dword */
-	    "  movd     %%mm3, (%%"REGB", %%"REGA")  \n\t"  /* save in our work area */
+	    "  movd     %%mm3, (%%" REGB", %%" REGA")  \n\t"  /* save in our work area */
 
-	    "  lea      4(%%"REGA"), %%"REGA"  \n\t"
-	    "  loop     "vLoopMMX"b      \n"
+	    "  lea      4(%%" REGA"), %%" REGA"  \n\t"
+	    "  loop     " vLoopMMX"b      \n"
 
 	    /* Add a little code here to check if we have 2 more pixels to do and, if so, make one
 	     * more pass thru vLoopMMX. We were processing in multiples of 4 pixels and alway have
 	     * an even number so there will never be more than 2 left. trbarry 7/29/2002
 	     */
-	    ""MoreSpareChange":    \n\t"
+	    "" MoreSpareChange":    \n\t"
 
-	    "  cmpl  "_EndOffset", %%"REGEA"  \n\t"  /* did we get them all */
-	    "  jnl   "DoHorizontal"f \n\t"  /* yes, else have 2 left */
-	    "  movl  $1, %%"REGC"    \n\t"  /* jigger loop ct */
-	    "  sub   $2, %%"REGA"    \n\t"  /* back up 2 pixels (4 bytes, but eax carried as 1/2) */
-	    "  jmp   "vLoopMMX"b     \n"
+	    "  cmpl  "_EndOffset", %%" REGEA"  \n\t"  /* did we get them all */
+	    "  jnl   " DoHorizontal"f \n\t"  /* yes, else have 2 left */
+	    "  movl  $1, %%" REGC"    \n\t"  /* jigger loop ct */
+	    "  sub   $2, %%" REGA"    \n\t"  /* back up 2 pixels (4 bytes, but eax carried as 1/2) */
+	    "  jmp   " vLoopMMX"b     \n"
 
 	    /*  We've taken care of the vertical scaling, now do horizontal  */
-	    ""DoHorizontal":      \n\t"
+	    "" DoHorizontal":      \n\t"
 
 	    "  movq  "_YMask",    %%mm7     \n\t"  /* useful 0U0U..  mask constant  */
 	    "  movq  "_FPround2", %%mm6     \n\t"  /* useful rounding constant, dwords  */
-	    "  mov   "_pControl", %%"REGSI" \n\t"  /* @ horiz control bytes  */	
-	    "  movl  "_row_size", %%"REGC"  \n\t"
-	    "  shrl  $2,          %%"REGC"  \n\t"  /* bytes a time, 2 pixels  */
-	    "  mov   "_vWorkYW",  %%"REGD"  \n\t"  /* our luma data, as 0Y0Y 0Y0Y..  */
-	    "  mov   "_dstp",     %%"REGDI" \n\t"  /* the destination line  */
-	    "  mov   "_vWorkUVW", %%"REGB"  \n"    /* chroma data, as UVUV UVUV...  */
+	    "  mov   "_pControl", %%" REGSI" \n\t"  /* @ horiz control bytes  */	
+	    "  movl  "_row_size", %%" REGC"  \n\t"
+	    "  shrl  $2,          %%" REGC"  \n\t"  /* bytes a time, 2 pixels  */
+	    "  mov   "_vWorkYW",  %%" REGD"  \n\t"  /* our luma data, as 0Y0Y 0Y0Y..  */
+	    "  mov   "_dstp",     %%" REGDI" \n\t"  /* the destination line  */
+	    "  mov   "_vWorkUVW", %%" REGB"  \n"    /* chroma data, as UVUV UVUV...  */
 
 	    ".align 16  \n"
-	    ""hLoopMMX":    \n\t"
+	    "" hLoopMMX":    \n\t"
 
 	    /* x86_64: must use movl (accessing table of uint32's) */
-	    "  movl      16(%%"REGSI"), %%"REGEA"        \n\t"  /* get data offset in pixels, 1st pixel pair */
-	    "  movd      (%%"REGD", %%"REGA", 2), %%mm0  \n\t"  /* copy luma pair */
-	    "  shr       $1, %%"REGA"                    \n\t"  /* div offset by 2 */
-	    "  movd      (%%"REGB", %%"REGA", 2), %%mm1  \n\t"  /* copy UV pair VUVU */
+	    "  movl      16(%%" REGSI"), %%" REGEA"        \n\t"  /* get data offset in pixels, 1st pixel pair */
+	    "  movd      (%%" REGD", %%" REGA", 2), %%mm0  \n\t"  /* copy luma pair */
+	    "  shr       $1, %%" REGA"                    \n\t"  /* div offset by 2 */
+	    "  movd      (%%" REGB", %%" REGA", 2), %%mm1  \n\t"  /* copy UV pair VUVU */
 	    "  psllw     $8, %%mm1                       \n\t"  /* shift out V, keep 0000U0U0 */
 	    
 	    /*  we need to use both even and odd croma from same location - trb 9/2002 */
-	    "  punpckldq (%%"REGB", %%"REGA", 2), %%mm1  \r\n"  /* copy UV pair VUVU  */
+	    "  punpckldq (%%" REGB", %%" REGA", 2), %%mm1  \r\n"  /* copy UV pair VUVU  */
 	    "  psrlw     $8, %%mm1                       \r\n"  /* shift out U0, keep 0V0V 0U0U   */
-	    "  movl      20(%%"REGSI"), %%"REGEA"        \r\n"  /* get data offset in pixels, 2nd pixel pair  */
-	    "  punpckldq (%%"REGD", %%"REGA", 2), %%mm0  \r\n"  /* copy luma pair  */
+	    "  movl      20(%%" REGSI"), %%" REGEA"        \r\n"  /* get data offset in pixels, 2nd pixel pair  */
+	    "  punpckldq (%%" REGD", %%" REGA", 2), %%mm0  \r\n"  /* copy luma pair  */
 	    
-	    "  pmaddwd    (%%"REGSI"), %%mm0  \r\n"  /* mult and sum lumas by ctl weights  */
+	    "  pmaddwd    (%%" REGSI"), %%mm0  \r\n"  /* mult and sum lumas by ctl weights  */
 	    "  paddusw    %%mm6, %%mm0        \r\n"  /* round  */
 	    "  psrlw      $8, %%mm0           \r\n"  /* right just 2 luma pixel value 000Y,000Y  */
 	    
-	    "  pmaddwd    8(%%"REGSI"), %%mm1 \r\n"  /* mult and sum chromas by ctl weights */
+	    "  pmaddwd    8(%%" REGSI"), %%mm1 \r\n"  /* mult and sum chromas by ctl weights */
 	    "  paddusw    %%mm6, %%mm1        \r\n"  /* round */
 	    "  pslld      $8, %%mm1           \r\n"  /* shift into low bytes of different words */
 	    "  pand       %%mm7, %%mm1        \r\n"  /* keep only 2 chroma values 0V00,0U00  */
 	    "  por        %%mm1, %%mm0        \r\n"  /* combine luma and chroma, 0V0Y,0U0Y  */
 	    "  packuswb   %%mm0, %%mm0        \r\n"  /* pack all into low dword, xxxxVYUY  */
-	    "  movd       %%mm0, (%%"REGDI")  \n\t"  /* done with 2 pixels */
+	    "  movd       %%mm0, (%%" REGDI")  \n\t"  /* done with 2 pixels */
 
-	    "  lea     24(%%"REGSI"), %%"REGSI"  \n\t"  /* bump to next control bytest */
-	    "  lea      4(%%"REGDI"), %%"REGDI"  \n\t"  /* bump to next output pixel addr */
+	    "  lea     24(%%" REGSI"), %%" REGSI"  \n\t"  /* bump to next control bytest */
+	    "  lea      4(%%" REGDI"), %%" REGDI"  \n\t"  /* bump to next output pixel addr */
 	    
-	    "  loop   "hLoopMMX"b             \n\t"  /* loop for more */
+	    "  loop   " hLoopMMX"b             \n\t"  /* loop for more */
 
 	    "emms              \n\t"
 	    /* done with one line */
 
 #if !defined(__x86_64__)
-	    "mov "_oldbx", %%"REGB" \n\t"
+	    "mov "_oldbx", %%" REGB" \n\t"
 #endif
 	    ::
 	    "m" /*0*/(FPround1), 
@@ -900,30 +900,30 @@
       srcp2 = (y < dst_height-1)  ?  srcp1 + src_pitch  :  srcp1;
 
     __asm__  __volatile__(
-             "movl "_src_row_size", %%"REGC" \n\t"
-	     "shr  $3,         %%"REGC"   \n\t"  /* 8 bytes a time */
-	     "mov  "_srcp1",   %%"REGSI"  \n\t"  /* top of 2 src lines to get */
-	     "mov  "_srcp2",   %%"REGD"   \n\t"  /* next "  */ 
-	     "mov  "_vWorkYW", %%"REGDI"  \n\t"  /* luma work destination line */
-	     "xor  %%"REGA",   %%"REGA"   \n\t"
+             "movl "_src_row_size", %%" REGC" \n\t"
+	     "shr  $3,         %%" REGC"   \n\t"  /* 8 bytes a time */
+	     "mov  "_srcp1",   %%" REGSI"  \n\t"  /* top of 2 src lines to get */
+	     "mov  "_srcp2",   %%" REGD"   \n\t"  /* next "  */ 
+	     "mov  "_vWorkYW", %%" REGDI"  \n\t"  /* luma work destination line */
+	     "xor  %%" REGA",   %%" REGA"   \n\t"
 #if !defined(__x86_64__)
 	     /* Let's check here to see if we are on a P4 or higher and can use SSE2 instructions.
 	      * This first loop is not the performance bottleneck anyway but it is trivial to tune
 	      * using SSE2 if we have proper alignment.
 	      */
 	     "testl $1, "_SSE2enabledW"   \n\t"  /* is SSE2 supported? */
-	     "jz    "vMaybeSSEMMX"f       \n\t"  /* n, can't do anyway */
+	     "jz    " vMaybeSSEMMX"f       \n\t"  /* n, can't do anyway */
 #endif
-	     "cmpl  $2, %%"REGC"          \n\t"  /* we have at least 16 byts, 2 qwords? */
-	     "jl    "vMaybeSSEMMX"f       \n\t"  /* n, don't bother */
+	     "cmpl  $2, %%" REGC"          \n\t"  /* we have at least 16 byts, 2 qwords? */
+	     "jl    " vMaybeSSEMMX"f       \n\t"  /* n, don't bother */
 	     
-	     "mov   %%"REGSI", %%"REGB"   \n\t"
-	     "or    %%"REGD",  %%"REGB"   \n\t"
-	     "test  $15,       %%"REGB"   \n\t"  /* both src rows 16 byte aligned? */
-	     "jnz   "vMaybeSSEMMX"f       \n\t"  /* n, don't use sse2 */
+	     "mov   %%" REGSI", %%" REGB"   \n\t"
+	     "or    %%" REGD",  %%" REGB"   \n\t"
+	     "test  $15,       %%" REGB"   \n\t"  /* both src rows 16 byte aligned? */
+	     "jnz   " vMaybeSSEMMX"f       \n\t"  /* n, don't use sse2 */
 			 
-	     "shr   $1, %%"REGC"          \n\t"  /* do 16 bytes at a time instead */
-	     "dec   %%"REGC"              \n\t"  /* jigger loop ct */
+	     "shr   $1, %%" REGC"          \n\t"  /* do 16 bytes at a time instead */
+	     "dec   %%" REGC"              \n\t"  /* jigger loop ct */
 			 
 	     "movdqu "_FPround1", %%xmm0  \n\t"
 	     "movdqu "_vWeight1", %%xmm5  \n\t"
@@ -931,15 +931,15 @@
 	     "pxor        %%xmm7, %%xmm7  \n"
 
 	     ".align 16                   \n"
-	     ""vLoopSSE2_Fetch":          \n\t"
+	     "" vLoopSSE2_Fetch":          \n\t"
 #ifdef PREFETCH
-	     "  prefetcht0 16(%%"REGSI", %%"REGA", 2) \n\t"
-	     "  prefetcht0 16(%%"REGD",  %%"REGA", 2) \n"
+	     "  prefetcht0 16(%%" REGSI", %%" REGA", 2) \n\t"
+	     "  prefetcht0 16(%%" REGD",  %%" REGA", 2) \n"
 #endif
-	     ""vLoopSSE2":  \n\t"
+	     "" vLoopSSE2":  \n\t"
 	     /* we're already checked pointers to be on dqword aligned */
-	     "  movdqa  (%%"REGSI", %%"REGA"), %%xmm1 \n\t" /* top of 2 lines to interpolate */
-	     "  movdqa  (%%"REGD",  %%"REGA"), %%xmm3 \n\t" /* 2nd of 2 lines */
+	     "  movdqa  (%%" REGSI", %%" REGA"), %%xmm1 \n\t" /* top of 2 lines to interpolate */
+	     "  movdqa  (%%" REGD",  %%" REGA"), %%xmm3 \n\t" /* 2nd of 2 lines */
 	     "  movdqa    %%xmm1, %%xmm2  \n\t"
 	     "  movdqa    %%xmm3, %%xmm4  \n\t"
 
@@ -964,34 +964,34 @@
 
 	     "  packuswb  %%xmm2, %%xmm1  \n\t"  /* pack words to our 16 byte answer */
 #ifdef STREAMING_STORE_TMP
-	     "  movntdq   %%xmm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
+	     "  movntdq   %%xmm1, (%%" REGDI", %%" REGA") \n\t" /* save lumas in our work area */
 #else
-	     "  movdqu    %%xmm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
+	     "  movdqu    %%xmm1, (%%" REGDI", %%" REGA") \n\t" /* save lumas in our work area */
 #endif			 
-	     "  lea   16(%%"REGA"), %%"REGA" \n\t"
-	     "  decl  %%"REGC"            \n\t"
+	     "  lea   16(%%" REGA"), %%" REGA" \n\t"
+	     "  decl  %%" REGC"            \n\t"
 
-	     "  jg    "vLoopSSE2_Fetch"b  \n\t"  /* if not on last one loop, prefetch  */
-	     "  jz    "vLoopSSE2"b        \n\t"  /* or just loop, or not  */
+	     "  jg    " vLoopSSE2_Fetch"b  \n\t"  /* if not on last one loop, prefetch  */
+	     "  jz    " vLoopSSE2"b        \n\t"  /* or just loop, or not  */
 
 	     /* done with our SSE2 fortified loop but we may need to pick up the spare change */
 #ifdef STREAMING_STORE_TMP
 	     "  sfence                  \n\t"
 #endif
-	     "  movl  "_src_row_size", %%"REGC" \n\t"  /* get count again   */
-	     "  andl  $15, %%"REGC"       \n\t"  /* just need mod 16  */
+	     "  movl  "_src_row_size", %%" REGC" \n\t"  /* get count again   */
+	     "  andl  $15, %%" REGC"       \n\t"  /* just need mod 16  */
 	     "  movq "_vWeight1", %%mm5   \n\t"
 	     "  movq "_vWeight2", %%mm6   \n\t"
 	     "  movq "_FPround1", %%mm0   \n\t"  /* useful rounding constant  */
 
-	     "  shrl  $3, %%"REGC"        \n\t"  /* 8 bytes at a time, any?  */
-	     "  jz   "MoreSpareChange"f   \n"    /* n, did them all  */
+	     "  shrl  $3, %%" REGC"        \n\t"  /* 8 bytes at a time, any?  */
+	     "  jz   " MoreSpareChange"f   \n"    /* n, did them all  */
 
 	     /* Let's check here to see if we are on a P2 or Athlon and can use SSEMMX instructions.
 	      * This first loop is not the performance bottleneck anyway but it is trivial to tune
 	      * using SSE if we have proper alignment.
 	      */
-	     ""vMaybeSSEMMX":             \n\t"
+	     "" vMaybeSSEMMX":             \n\t"
 
 	     "  movq "_vWeight1", %%mm5   \n\t"  
 	     "  movq "_vWeight2", %%mm6   \n\t"  
@@ -999,20 +999,20 @@
 	     "  pxor       %%mm7, %%mm7   \n\t"
 #if !defined(__x86_64__)
 	     "  testl $1, "_SSEMMXenabledW" \n\t"/* MMXEXTsupported? */
-	     "  jz    "vLoopMMX"f         \n\t"  /* n, can't do anyway */
+	     "  jz    " vLoopMMX"f         \n\t"  /* n, can't do anyway */
 #endif
-	     "  decl  %%"REGC"      \n"  /* jigger loop ctr */
+	     "  decl  %%" REGC"      \n"  /* jigger loop ctr */
 			 
 	     ".align 16             \n"
-	     ""vLoopSSEMMX_Fetch":  \n\t"
+	     "" vLoopSSEMMX_Fetch":  \n\t"
 #ifdef PREFETCH
-	     "  prefetcht0 8(%%"REGSI", %%"REGA")  \n\t"
-	     "  prefetcht0 8(%%"REGD",  %%"REGA")  \n"
+	     "  prefetcht0 8(%%" REGSI", %%" REGA")  \n\t"
+	     "  prefetcht0 8(%%" REGD",  %%" REGA")  \n"
 #endif
-	     ""vLoopSSEMMX":   \n\t"
+	     "" vLoopSSEMMX":   \n\t"
 
-	     "  movq    (%%"REGSI", %%"REGA"), %%mm1  \n\t"   /* top of 2 lines to interpolate */
-	     "  movq    (%%"REGD",  %%"REGA"), %%mm3  \n\t"   /* 2nd of 2 lines    */
+	     "  movq    (%%" REGSI", %%" REGA"), %%mm1  \n\t"   /* top of 2 lines to interpolate */
+	     "  movq    (%%" REGD",  %%" REGA"), %%mm3  \n\t"   /* 2nd of 2 lines    */
 
 	     "  movq      %%mm1, %%mm2  \n\t"
 	     "  movq      %%mm3, %%mm4  \n\t"
@@ -1038,25 +1038,25 @@
 
 	     "  packuswb  %%mm2, %%mm1  \n\t"  /* pack words to our 8 byte answer */
 #ifdef STREAMING_STORE_TMP
-	     "  movntq    %%mm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
+	     "  movntq    %%mm1, (%%" REGDI", %%" REGA") \n\t" /* save lumas in our work area */
 #else
-	     "  movq      %%mm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
+	     "  movq      %%mm1, (%%" REGDI", %%" REGA") \n\t" /* save lumas in our work area */
 #endif
-	     "  lea   8(%%"REGA"), %%"REGA" \n\t"
-	     "  decl  %%"REGC"              \n\t"
+	     "  lea   8(%%" REGA"), %%" REGA" \n\t"
+	     "  decl  %%" REGC"              \n\t"
 
-	     "  jg    "vLoopSSEMMX_Fetch"b  \n\t"  /* if not on last one loop, prefetch  */
-	     "  jz    "vLoopSSEMMX"b        \n\t"  /* or just loop, or not  */
+	     "  jg    " vLoopSSEMMX_Fetch"b  \n\t"  /* if not on last one loop, prefetch  */
+	     "  jz    " vLoopSSEMMX"b        \n\t"  /* or just loop, or not  */
 #ifdef STREAMING_STORE_TMP
 	     "  sfence                      \n\t"
 #endif
-	     "  jmp    "MoreSpareChange"f   \n"    /* all done with vertical  */
+	     "  jmp    " MoreSpareChange"f   \n"    /* all done with vertical  */
 
 	     ".align 16        \n"
-	     ""vLoopMMX":      \n\t"
+	     "" vLoopMMX":      \n\t"
 
-	     "  movq    (%%"REGSI", %%"REGA"), %%mm1  \n\t"  /* top of 2 lines to interpolate */
-	     "  movq    (%%"REGD",  %%"REGA"), %%mm3  \n\t"  /* 2nd of 2 lines    */
+	     "  movq    (%%" REGSI", %%" REGA"), %%mm1  \n\t"  /* top of 2 lines to interpolate */
+	     "  movq    (%%" REGD",  %%" REGA"), %%mm3  \n\t"  /* 2nd of 2 lines    */
 
 	     "  movq      %%mm1, %%mm2  \n\t"
 	     "  movq      %%mm3, %%mm4  \n\t"
@@ -1081,84 +1081,84 @@
 	     "  psrlw     $8, %%mm2     \n\t"  /* right adjust luma */
 
 	     "  packuswb  %%mm2, %%mm1  \n\t"  /* pack words to our 8 byte answer */
-	     "  movq      %%mm1, (%%"REGDI", %%"REGA") \n\t" /* save lumas in our work area */
+	     "  movq      %%mm1, (%%" REGDI", %%" REGA") \n\t" /* save lumas in our work area */
 			 
-	     "  lea   8(%%"REGA"), %%"REGA" \n\t"
-	     "  loop  "vLoopMMX"b  \n"
+	     "  lea   8(%%" REGA"), %%" REGA" \n\t"
+	     "  loop  " vLoopMMX"b  \n"
 
 	     /* Add a little code here to check if we have more pixels to do and, if so, make one
 	      * more pass thru vLoopMMX. We were processing in multiples of 8 pixels and alway have
 	      * an even number so there will never be more than 7 left.
 	      */
-	     ""MoreSpareChange":     \n\t"
+	     "" MoreSpareChange":     \n\t"
 
-	     "  cmpl "_src_row_size", %%"REGEA"  \n\t"  /* did we get them all */
-	     "  jnl  "DoHorizontal"f  \n\t"  /* yes, else have 2 left */
-	     "  movl $1, %%"REGC"     \n\t"  /* jigger loop ct */
-	     "  movl "_src_row_size", %%"REGEA"  \n\t"
-	     "  sub  $8, %%"REGA"     \n\t"  /* back up to last 8 pixels */
-	     "  jmp  "vLoopMMX"b      \n"
+	     "  cmpl " _src_row_size", %%" REGEA"  \n\t"  /* did we get them all */
+	     "  jnl  " DoHorizontal"f  \n\t"  /* yes, else have 2 left */
+	     "  movl $1, %%" REGC"     \n\t"  /* jigger loop ct */
+	     "  movl " _src_row_size", %%" REGEA"  \n\t"
+	     "  sub  $8, %%" REGA"     \n\t"  /* back up to last 8 pixels */
+	     "  jmp  " vLoopMMX"b      \n"
 
 	     /*  We've taken care of the vertical scaling, now do horizontal  */
-	     ""DoHorizontal":        \n\t"
+	     "" DoHorizontal":        \n\t"
 	     "  pxor        %%mm7, %%mm7     \n\t"
 	     "  movq  "_FPround2", %%mm6     \n\t"  /* useful rounding constant, dwords  */
-	     "  mov   "_pControl", %%"REGSI" \n\t"  /* @ horiz control bytes  */	
-	     "  movl  "_row_size", %%"REGC"  \n\t"
-	     "  shrl  $2, %%"REGC"          \n\t"  /* 4 bytes a time, 4 pixels  */
-	     "  mov   "_vWorkYW",  %%"REGD"  \n\t"  /* our luma data, as 0Y0Y 0Y0Y..  */
-	     "  mov   "_dstp",     %%"REGDI" \n\t"  /* the destination line  */
+	     "  mov   " _pControl", %%" REGSI" \n\t"  /* @ horiz control bytes  */
+	     "  movl  " _row_size", %%" REGC"  \n\t"
+	     "  shrl  $2, %%" REGC"          \n\t"  /* 4 bytes a time, 4 pixels  */
+	     "  mov   " _vWorkYW",  %%" REGD"  \n\t"  /* our luma data, as 0Y0Y 0Y0Y..  */
+	     "  mov   " _dstp",     %%" REGDI" \n\t"  /* the destination line  */
 #if !defined(__x86_64__)
 	     "  testl $1, "_SSEMMXenabledW" \n\t"  /* MMXEXTsupported? */
-	     "  jz    "hLoopMMX"f           \n\t"  /* n, can't do anyway */
+	     "  jz    " hLoopMMX"f           \n\t"  /* n, can't do anyway */
 #endif
 	     /* With SSE support we will make 8 pixels (from 8 pairs) at a time */
-	     "  shrl  $1, %%"REGC"  \n\t"  /* 8 bytes a time instead of 4  */
-	     "  jz    "LessThan8"f  \n"
+	     "  shrl  $1, %%" REGC"  \n\t"  /* 8 bytes a time instead of 4  */
+	     "  jz    " LessThan8"f  \n"
 
 	     ".align 16          \n"
-	     ""hLoopMMXSSE":    \n\t"
+	     "" hLoopMMXSSE":    \n\t"
 
 
 	     /* handle first 2 pixels */
 	     /* phi: must use movl here (x86_64, reading from table of uint_32's) */
-	     "  movl   16(%%"REGSI"), %%"REGEA"  \n\t"  /* get data offset in pixels, 1st pixel pair */
-	     "  movl   20(%%"REGSI"), %%"REGEB"  \r\n"  /* get data offset in pixels, 2nd pixel pair  */
+	     "  movl   16(%%" REGSI"), %%" REGEA"  \n\t"  /* get data offset in pixels, 1st pixel pair */
+	     "  movl   20(%%" REGSI"), %%" REGEB"  \r\n"  /* get data offset in pixels, 2nd pixel pair  */
 
-	     "  movd      (%%"REGD", %%"REGA"), %%mm0  \n\t"  /* copy luma pair 0000xxYY */
-	     "  punpcklwd (%%"REGD", %%"REGB"), %%mm0  \r\n"  /* 2nd luma pair, now xxxxYYYY  */
+	     "  movd      (%%" REGD", %%" REGA"), %%mm0  \n\t"  /* copy luma pair 0000xxYY */
+	     "  punpcklwd (%%" REGD", %%" REGB"), %%mm0  \r\n"  /* 2nd luma pair, now xxxxYYYY  */
 	     "  punpcklbw %%mm7, %%mm0      \n\t"  /* make words out of bytes, 0Y0Y0Y0Y */
-	     "  movl      16+24(%%"REGSI"), %%"REGEA"  \n\t"  /* get data offset in pixels, 3st pixel pair */
-	     "  movl      20+24(%%"REGSI"), %%"REGEB"  \r\n"  /* get data offset in pixels, 4nd pixel pair  */
-	     "  pmaddwd   (%%"REGSI"), %%mm0  \n\t"  /* mult and sum lumas by ctl weights */
+	     "  movl      16+24(%%" REGSI"), %%" REGEA"  \n\t"  /* get data offset in pixels, 3st pixel pair */
+	     "  movl      20+24(%%" REGSI"), %%" REGEB"  \r\n"  /* get data offset in pixels, 4nd pixel pair  */
+	     "  pmaddwd   (%%" REGSI"), %%mm0  \n\t"  /* mult and sum lumas by ctl weights */
 	     "  paddusw    %%mm6, %%mm0       \n\t"  /* round */
 	     "  psrlw         $8, %%mm0       \n\t"  /* right just 4 luma pixel value 0Y0Y0Y0Y */
 
 	     /* handle 3rd and 4th pixel pairs */
-	     "  movd      (%%"REGD", %%"REGA"), %%mm1  \n\t"  /* copy luma pair 0000xxYY */
-	     "  punpcklwd (%%"REGD", %%"REGB"), %%mm1  \r\n"  /* 2nd luma pair, now xxxxYYYY  */
+	     "  movd      (%%" REGD", %%" REGA"), %%mm1  \n\t"  /* copy luma pair 0000xxYY */
+	     "  punpcklwd (%%" REGD", %%" REGB"), %%mm1  \r\n"  /* 2nd luma pair, now xxxxYYYY  */
 	     "  punpcklbw %%mm7, %%mm1        \n\t"  /* make words out of bytes, 0Y0Y0Y0Y */
-	     "  movl      16+48(%%"REGSI"), %%"REGEA"  \n\t"  /* get data offset in pixels, 5st pixel pair */
-	     "  movl      20+48(%%"REGSI"), %%"REGEB"  \r\n"  /* get data offset in pixels, 6nd pixel pair  */
-	     "  pmaddwd 24(%%"REGSI"), %%mm1  \n\t"  /* mult and sum lumas by ctl weights */
+	     "  movl      16+48(%%" REGSI"), %%" REGEA"  \n\t"  /* get data offset in pixels, 5st pixel pair */
+	     "  movl      20+48(%%" REGSI"), %%" REGEB"  \r\n"  /* get data offset in pixels, 6nd pixel pair  */
+	     "  pmaddwd 24(%%" REGSI"), %%mm1  \n\t"  /* mult and sum lumas by ctl weights */
 	     "  paddusw    %%mm6, %%mm1       \n\t"  /* round */
 	     "  psrlw         $8, %%mm1       \n\t"  /* right just 4 luma pixel value 0Y0Y0Y0Y */
 
 	     /* handle 5th and 6th pixel pairs */
-	     "  movd      (%%"REGD", %%"REGA"), %%mm2  \n\t"  /* copy luma pair 0000xxYY */
-	     "  punpcklwd (%%"REGD", %%"REGB"), %%mm2  \r\n"  /* 2nd luma pair, now xxxxYYYY  */
+	     "  movd      (%%" REGD", %%" REGA"), %%mm2  \n\t"  /* copy luma pair 0000xxYY */
+	     "  punpcklwd (%%" REGD", %%" REGB"), %%mm2  \r\n"  /* 2nd luma pair, now xxxxYYYY  */
 	     "  punpcklbw %%mm7, %%mm2        \n\t"  /* make words out of bytes, 0Y0Y0Y0Y */
-	     "  movl      16+72(%%"REGSI"), %%"REGEA"  \n\t"  /* get data offset in pixels, 7st pixel pair */
-	     "  movl      20+72(%%"REGSI"), %%"REGEB"  \r\n"  /* get data offset in pixels, 8nd pixel pair  */
-	     "  pmaddwd 48(%%"REGSI"), %%mm2  \n\t"  /* mult and sum lumas by ctl weights */
+	     "  movl      16+72(%%" REGSI"), %%" REGEA"  \n\t"  /* get data offset in pixels, 7st pixel pair */
+	     "  movl      20+72(%%" REGSI"), %%" REGEB"  \r\n"  /* get data offset in pixels, 8nd pixel pair  */
+	     "  pmaddwd 48(%%" REGSI"), %%mm2  \n\t"  /* mult and sum lumas by ctl weights */
 	     "  paddusw    %%mm6, %%mm2       \n\t"  /* round */
 	     "  psrlw         $8, %%mm2       \n\t"  /* right just 4 luma pixel value 0Y0Y0Y0Y */
 
 	     /* handle 7th and 8th pixel pairs */
-	     "  movd      (%%"REGD", %%"REGA"), %%mm3  \n\t"  /* copy luma pair 0000xxYY */
-	     "  punpcklwd (%%"REGD", %%"REGB"), %%mm3  \r\n"  /* 2nd luma pair, now xxxxYYYY  */
+	     "  movd      (%%" REGD", %%" REGA"), %%mm3  \n\t"  /* copy luma pair 0000xxYY */
+	     "  punpcklwd (%%" REGD", %%" REGB"), %%mm3  \r\n"  /* 2nd luma pair, now xxxxYYYY  */
 	     "  punpcklbw %%mm7, %%mm3        \n\t"  /* make words out of bytes, 0Y0Y0Y0Y */
-	     "  pmaddwd 72(%%"REGSI"), %%mm3  \n\t"  /* mult and sum lumas by ctl weights */
+	     "  pmaddwd 72(%%" REGSI"), %%mm3  \n\t"  /* mult and sum lumas by ctl weights */
 	     "  paddusw    %%mm6, %%mm3       \n\t"  /* round */
 	     "  psrlw         $8, %%mm3       \n\t"  /* right just 4 luma pixel value 0Y0Y0Y0Y */
 
@@ -1167,99 +1167,99 @@
 	     "  packuswb %%mm3, %%mm2         \n\t"  /* pack into qword, 0Y0Y0Y0Y */
 	     "  packuswb %%mm2, %%mm0         \n\t"  /* and again into  YYYYYYYY */			
 #ifdef STREAMING_STORE
-	     "  movntq   %%mm0, (%%"REGDI")   \n\t"  /* done with 4 pixels */
+	     "  movntq   %%mm0, (%%" REGDI")   \n\t"  /* done with 4 pixels */
 #else
-	     "  movq     %%mm0, (%%"REGDI")   \n\t"  /* done with 4 pixels */
+	     "  movq     %%mm0, (%%" REGDI")   \n\t"  /* done with 4 pixels */
 #endif
 
-	     "  lea  96(%%"REGSI"), %%"REGSI" \n\t"
-	     "  lea   8(%%"REGDI"), %%"REGDI" \n\t"
-	     "  decl  %%"REGC"                \n\t"
-	     "  jg    "hLoopMMXSSE"b    \n\t"   /* loop for more  */
+	     "  lea  96(%%" REGSI"), %%" REGSI" \n\t"
+	     "  lea   8(%%" REGDI"), %%" REGDI" \n\t"
+	     "  decl  %%" REGC"                \n\t"
+	     "  jg    " hLoopMMXSSE"b    \n\t"   /* loop for more  */
 #ifdef STREAMING_STORE
 	     "  sfence                  \n"
 #endif
-	     ""LessThan8":    \n\t"
-	     "  movl "_row_size", %%"REGC"  \n\t"
-	     "  andl          $7, %%"REGC"  \n\t"  /* we have done all but maybe this */
-	     "  shrl          $2, %%"REGC"  \n\t"  /* now do only 4 bytes at a time */
-	     "  jz            "LessThan4"f  \n"
+	     "" LessThan8":    \n\t"
+	     "  movl " _row_size", %%" REGC"  \n\t"
+	     "  andl          $7, %%" REGC"  \n\t"  /* we have done all but maybe this */
+	     "  shrl          $2, %%" REGC"  \n\t"  /* now do only 4 bytes at a time */
+	     "  jz            " LessThan4"f  \n"
 
 	     ".align 16   \n"
-	     ""hLoopMMX":    \n\t"
+	     "" hLoopMMX":    \n\t"
 
 	     /* handle first 2 pixels */
-	     "  movl   16(%%"REGSI"), %%"REGEA"  \n\t"  /* get data offset in pixels, 1st pixel pair */
-	     "  movl   20(%%"REGSI"), %%"REGEB"  \r\n"  /* get data offset in pixels, 2nd pixel pair  */
-	     "  movd      (%%"REGD", %%"REGA"), %%mm0  \n\t"  /* copy luma pair 0000xxYY */
-	     "  punpcklwd (%%"REGD", %%"REGB"), %%mm0  \r\n"  /* 2nd luma pair, now xxxxYYYY  */
+	     "  movl   16(%%" REGSI"), %%" REGEA"  \n\t"  /* get data offset in pixels, 1st pixel pair */
+	     "  movl   20(%%" REGSI"), %%" REGEB"  \r\n"  /* get data offset in pixels, 2nd pixel pair  */
+	     "  movd      (%%" REGD", %%" REGA"), %%mm0  \n\t"  /* copy luma pair 0000xxYY */
+	     "  punpcklwd (%%" REGD", %%" REGB"), %%mm0  \r\n"  /* 2nd luma pair, now xxxxYYYY  */
 	     "  punpcklbw %%mm7, %%mm0      \n\t"  /* make words out of bytes, 0Y0Y0Y0Y */
-	     "  movl      16+24(%%"REGSI"), %%"REGEA"  \n\t"  /* get data offset in pixels, 3st pixel pair */
-	     "  movl      20+24(%%"REGSI"), %%"REGEB"  \r\n"  /* get data offset in pixels, 4nd pixel pair  */
-	     "  pmaddwd   (%%"REGSI"), %%mm0  \n\t"  /* mult and sum lumas by ctl weights */
+	     "  movl      16+24(%%" REGSI"), %%" REGEA"  \n\t"  /* get data offset in pixels, 3st pixel pair */
+	     "  movl      20+24(%%" REGSI"), %%" REGEB"  \r\n"  /* get data offset in pixels, 4nd pixel pair  */
+	     "  pmaddwd   (%%" REGSI"), %%mm0  \n\t"  /* mult and sum lumas by ctl weights */
 	     "  paddusw    %%mm6, %%mm0       \n\t"  /* round */
 	     "  psrlw         $8, %%mm0       \n\t"  /* right just 4 luma pixel value 0Y0Y0Y0Y */
 
 	     /* handle 3rd and 4th pixel pairs */
-	     "  movd      (%%"REGD", %%"REGA"), %%mm1  \n\t"  /* copy luma pair 0000xxYY */
-	     "  punpckldq (%%"REGD", %%"REGB"), %%mm1  \r\n"  /* 2nd luma pair, now xxxxYYYY  */
+	     "  movd      (%%" REGD", %%" REGA"), %%mm1  \n\t"  /* copy luma pair 0000xxYY */
+	     "  punpckldq (%%" REGD", %%" REGB"), %%mm1  \r\n"  /* 2nd luma pair, now xxxxYYYY  */
 	     "  punpcklbw %%mm7, %%mm1        \n\t"  /* make words out of bytes, 0Y0Y0Y0Y */
-	     "  pmaddwd 24(%%"REGSI"), %%mm1  \n\t"  /* mult and sum lumas by ctl weights */
+	     "  pmaddwd 24(%%" REGSI"), %%mm1  \n\t"  /* mult and sum lumas by ctl weights */
 	     "  paddusw    %%mm6, %%mm1       \n\t"  /* round */
 	     "  psrlw         $8, %%mm1       \n\t"  /* right just 4 luma pixel value 0Y0Y0Y0Y */
 
 	     /* combine, store, and loop */
 	     "  packuswb %%mm1, %%mm0         \n\t"  /* pack into qword, 0Y0Y0Y0Y */
 	     "  packuswb %%mm7, %%mm0         \n\t"  /* and again into  0000YYYY */			
-	     "  movd     %%mm0, (%%"REGDI")   \n\t"  /* done with 4 pixels */
-	     "  lea  48(%%"REGSI"), %%"REGSI" \n\t"
-	     "  lea   4(%%"REGDI"), %%"REGDI" \n\t"
+	     "  movd     %%mm0, (%%" REGDI")   \n\t"  /* done with 4 pixels */
+	     "  lea  48(%%" REGSI"), %%" REGSI" \n\t"
+	     "  lea   4(%%" REGDI"), %%" REGDI" \n\t"
 
-	     "  loop   "hLoopMMX"b            \n"    /* loop for more */
+	     "  loop   " hLoopMMX"b            \n"    /* loop for more */
 		 
 	     /* test to see if we have a mod 4 size row, if not then more spare change */
-	     ""LessThan4":    \n\t"
-	     "  movl "_row_size", %%"REGC"    \n\t"
-	     "  andl          $3, %%"REGC"    \n\t"  /* remainder side mod 4 */
-	     "  cmpl          $2, %%"REGC"    \n\t"  
-	     "  jl            "LastOne"f      \n\t"  /* none, none */
+	     "" LessThan4":    \n\t"
+	     "  movl " _row_size", %%" REGC"    \n\t"
+	     "  andl          $3, %%" REGC"    \n\t"  /* remainder side mod 4 */
+	     "  cmpl          $2, %%" REGC"    \n\t"  
+	     "  jl            " LastOne"f      \n\t"  /* none, none */
 
 	     /* handle 2 more pixels */
-	     "  movl      16(%%"REGSI"), %%"REGEA"  \n\t"  /* get data offset in pixels, 1st pixel pair */
-	     "  movl      20(%%"REGSI"), %%"REGEB"  \r\n"  /* get data offset in pixels, 2nd pixel pair  */
-	     "  movd      (%%"REGD", %%"REGA"), %%mm0  \n\t"  /* copy luma pair 0000xxYY */
-	     "  punpcklwd (%%"REGD", %%"REGB"), %%mm0  \r\n"  /* 2nd luma pair, now xxxxYYYY  */
+	     "  movl      16(%%" REGSI"), %%" REGEA"  \n\t"  /* get data offset in pixels, 1st pixel pair */
+	     "  movl      20(%%" REGSI"), %%" REGEB"  \r\n"  /* get data offset in pixels, 2nd pixel pair  */
+	     "  movd      (%%" REGD", %%" REGA"), %%mm0  \n\t"  /* copy luma pair 0000xxYY */
+	     "  punpcklwd (%%" REGD", %%" REGB"), %%mm0  \r\n"  /* 2nd luma pair, now xxxxYYYY  */
 	     "  punpcklbw %%mm7, %%mm0        \n\t"  /* make words out of bytes, 0Y0Y0Y0Y */
 
-	     "  pmaddwd   (%%"REGSI"), %%mm0  \n\t"  /* mult and sum lumas by ctl weights */
+	     "  pmaddwd   (%%" REGSI"), %%mm0  \n\t"  /* mult and sum lumas by ctl weights */
 	     "  paddusw   %%mm6, %%mm0        \n\t"  /* round */
 	     "  psrlw        $8, %%mm0        \n\t"  /* right just 4 luma pixel value 0Y0Y0Y0Y */
 	     "  packuswb  %%mm7, %%mm0        \n\t"  /* pack into qword, 00000Y0Y */
 	     "  packuswb  %%mm7, %%mm0        \n\t"  /* and again into  000000YY */			
-	     "  movd      %%mm0, (%%"REGDI")  \n\t"  /* store, we are guarrenteed room in buffer (8 byte mult) */
-	     "  subl         $2, %%"REGC"     \n\t"  
+	     "  movd      %%mm0, (%%" REGDI")  \n\t"  /* store, we are guaranteed room in buffer (8 byte mult) */
+	     "  subl         $2, %%" REGC"     \n\t"  
 	     
-	     "  lea  24(%%"REGSI"), %%"REGSI" \n\t" /* bump to next control bytes */
-	     "  lea   2(%%"REGDI"), %%"REGDI" \n" /* bump to next output pixel addr */
+	     "  lea  24(%%" REGSI"), %%" REGSI" \n\t" /* bump to next control bytes */
+	     "  lea   2(%%" REGDI"), %%" REGDI" \n" /* bump to next output pixel addr */
 
 	     /* maybe one last pixel */
-	     ""LastOne":   \n\t"
-	     "  cmpl   $0, %%"REGC"   \r\n"  /* still more ? */
-	     "  jz     "AllDone"f     \r\n"  /* n, done */
-	     "  movl   16(%%"REGSI"), %%"REGEA"     \n\t"  /* get data offset in pixels, 1st pixel pair */
-	     "  movd   (%%"REGD", %%"REGA"), %%mm0  \n\t"  /* copy luma pair 0000xxYY */
+	     "" LastOne":   \n\t"
+	     "  cmpl   $0, %%" REGC"   \r\n"  /* still more ? */
+	     "  jz     " AllDone"f     \r\n"  /* n, done */
+	     "  movl   16(%%" REGSI"), %%" REGEA"     \n\t"  /* get data offset in pixels, 1st pixel pair */
+	     "  movd   (%%" REGD", %%" REGA"), %%mm0  \n\t"  /* copy luma pair 0000xxYY */
 	     "  punpcklbw %%mm7, %%mm0        \n\t"  /* make words out of bytes, 0Y0Y0Y0Y */
 
-	     "  pmaddwd   (%%"REGSI"), %%mm0  \n\t"  /* mult and sum lumas by ctl weights */
+	     "  pmaddwd   (%%" REGSI"), %%mm0  \n\t"  /* mult and sum lumas by ctl weights */
 	     "  paddusw    %%mm6, %%mm0       \n\t"  /* round */
 	     "  psrlw         $8, %%mm0       \n\t"  /* right just 4 luma pixel value 0Y0Y0Y0Y */
-	     "  movd       %%mm0, %%"REGEA"   \n\t"
-	     "  movb        %%al, (%%"REGDI") \n"    /* store last one */
+	     "  movd       %%mm0, %%" REGEA"   \n\t"
+	     "  movb        %%al, (%%" REGDI") \n"    /* store last one */
 			 
-	     ""AllDone":  \n\t"
+	     "" AllDone":  \n\t"
 	     "  emms      \n\t"
 #if !defined(__x86_64__)
-	     "mov "_oldbx", %%"REGB" \n\t"
+	     "mov " _oldbx", %%" REGB" \n\t"
 #endif
 	     ::
 	     "m" /*0*/(FPround1),
