Nyamdavaa Amar committed
Commit · cf49f13
Parent(s): 3d4d40d

Edit presets
Browse files
- adaptive_schedule.py +25 -50
- app.py +18 -16
- description1.md +4 -0
- description2.md +1 -0
- description3.md +0 -0
- interleaved_variant.py +9 -14
- schedule1f1bv.py +12 -12
adaptive_schedule.py
CHANGED
@@ -46,9 +46,9 @@ def transform_schedule(schedule, f, b, w, c):
             time = 0
             if (stage, type, mb) in local_prev:
                 time = get_time(stage, *local_prev[(stage, type, mb)])
-            if type in
+            if type in "FB" and stage > 0:
                 time = max(time, get_time(stage - 1, type, mb) + c)
-            if type in
+            if type in "fb" and stage + 1 < len(schedule):
                 time = max(time, get_time(stage + 1, type, mb) + c)
             # print(f'{stage} {type}:{mb}', time + cost[type])
             time_map[(stage, type, mb)] = time + cost[type]
@@ -63,7 +63,7 @@ def transform_schedule(schedule, f, b, w, c):
         for p, mb in stage:
             result_stage.append(ScheduledNode(
                 p.upper(),
-                p in
+                p in "fBW",
                 sid,
                 mb,
                 get_time(sid, p, mb) - cost[p],
@@ -110,9 +110,9 @@ def evaluate_schedule(schedule, f, b, w, c):
             time = 0
             if (stage, type, mb) in local_prev:
                 time = get_time(stage, *local_prev[(stage, type, mb)])
-            if type in
+            if type in "FB" and stage > 0:
                 time = max(time, get_time(stage - 1, type, mb) + c)
-            if type in
+            if type in "fb" and stage + 1 < len(schedule):
                 time = max(time, get_time(stage + 1, type, mb) + c)
             # print(f'{stage} {type}:{mb}', time + cost[type])
             time_map[(stage, type, mb)] = time + cost[type]
@@ -153,16 +153,6 @@ def get_peak_mem(schedules, return_all=False):
         return all_peak
     return max_peak
 
-debug = False
-def print_schedules(schedules):
-    if not debug:
-        return
-    for seq in schedules:
-        _str = ""
-        for v in seq:
-            _str += v
-        print(_str)
-
 
 def calc_bubble(schedules):
     stage_bubbles = []
@@ -199,8 +189,8 @@ def clear_invalid(repeated, stage, pos, offset=-1):
 def clear_invalid_index(repeated, m):
     p = len(repeated)
     index = pattern_size
-    for identifier in
-        if identifier in
+    for identifier in "FfBb":
+        if identifier in "FB":
             _iter = range(p)
         else:
             _iter = range(p - 1, -1, -1)
@@ -210,7 +200,7 @@ def clear_invalid_index(repeated, m):
             clear_invalid(repeated, i, index - pattern_size, offset=-1)
             clear_invalid(repeated, i, index + pattern_size * m, offset=1)
             index += 1
-            if identifier in
+            if identifier in "Bb":
                 w_identifier = {'B': 'W', 'b': 'w'}[identifier]
                 for k in range(pattern_size):
                     if repeated[i][index + k] == w_identifier:
@@ -386,6 +376,17 @@ def squeeze_without_change_order(schedules, m):
             identifier_cnt[i][identifier] += 1
             identifier_index[_cnt * p + i][identifier] = index
             stage_index[i] = index + 1
+    while True:
+        if(len(squeezed[0]) == 1):
+            break
+        allempty = True
+        for x in squeezed:
+            if x[-1] != ' ':
+                allempty = False
+        if allempty == False:
+            break
+        for x in squeezed:
+            del x[-1]
     return squeezed
 
 
@@ -485,12 +486,11 @@ def process_cooldown(schedules, m):
     return schedules
 
 
-def schedule_by_pattern(p, m, patterns):
+def schedule_by_pattern(p, m, patterns, max_mem):
     schedules = init_repeated_schedule(p, max(m, 2 * p), patterns)
     schedules = clear_invalid_index(schedules, max(m, 2 * p))
-    print_schedules(schedules)
     init_peak_mem = get_peak_mem(schedules)
-    if init_peak_mem >
+    if init_peak_mem > max_mem:
         return None, init_peak_mem, [6 * max(m, 2 * p)] * p
     schedules = process_warmup_without_increasing_peak_mem(schedules, max(m, 2 * p))
 
@@ -503,20 +503,16 @@ def schedule_by_pattern(p, m, patterns):
                 schedules[sid][i] = ' '
             else:
                 cnt[schedules[sid][i]] += 1
-    print_schedules(schedules)
     peak_mem = get_peak_mem(schedules)
     if peak_mem > init_peak_mem:
         return None, init_peak_mem, [6 * m] * p
 
     schedules = squeeze_without_change_order(schedules, m)
-    print_schedules(schedules)
 
     schedules = process_cooldown(schedules, m)
-    print_schedules(schedules)
     peak_mem = get_peak_mem(schedules)
     if peak_mem > init_peak_mem:
         return None, init_peak_mem, [6 * m] * p
-
     stage_bubbles = calc_bubble(schedules)
     return schedules, peak_mem, stage_bubbles
 
@@ -572,25 +568,8 @@ def schedule(p, m, cost, max_mem):
             pattern = [0, ff_i, b_i, bb_i, -1, -1]
             pattern = fill_w_in_pattern(pattern)
             available_patterns.append(pattern)
-    available_offsets = []
-    for f_o in range(1, pattern_size + 1):
-        for ff_o in range(1, pattern_size + 1):
-            for b_o in range(1, pattern_size + 1):
-                if f_o != b_o:
-                    continue
-                bb_o = ff_o + b_o - f_o
-                if bb_o < 1 or bb_o > pattern_size:
-                    continue
-                if bb_o + ff_o + b_o + f_o > 2 * pattern_size:
-                    continue
-                # if bb_o + ff_o + b_o + f_o != 6:
-                #     continue
-                offset = [f_o, - ff_o, b_o, - bb_o]
-                if min(ff_o, bb_o) > 1:
-                    continue
-                available_offsets.append(offset)
 
-    print(
+    print(len(available_patterns))
     available_offsets = [
         [1, -1, 1, -1],
         [2, -1, 2, -1],
@@ -601,7 +580,6 @@ def schedule(p, m, cost, max_mem):
 
     best_schedule = None
     best_bubble = None
-    peak_mem2min_bubble = {}
     for pattern_0 in available_patterns:
         for i_0 in range(len(available_offsets)):
             for i_1 in range(i_0 + 1):
@@ -611,13 +589,10 @@ def schedule(p, m, cost, max_mem):
                 whole_pattern = get_whole_pattern(pattern_0, offset_0, offset_1, len_0, p)
                 if whole_pattern is None:
                     continue
-
-                # print(get_pattern_str(pattern))
-                # print(offset)
-                s, peak_mem, bubbles = schedule_by_pattern(p, m, whole_pattern)
-                if s is None:
-                    continue
+                s, peak_mem, bubbles = schedule_by_pattern(p, m, whole_pattern, min(2 * p, max_mem))
                 if peak_mem > 2 * p or peak_mem > max_mem:
+                    break
+                if s is None:
                     continue
                 max_bubble = max(bubbles)
                 max_bubble = evaluate_schedule(s, *cost)
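The boundary guards added to `transform_schedule` and `evaluate_schedule` encode the cross-stage dependencies used to time a chunked schedule: capital-letter passes (F, B) wait on the same pass at the previous stage plus one communication c, lower-case passes (f, b) wait on the next stage, and the new `stage > 0` / `stage + 1 < len(schedule)` checks simply skip those lookups at the pipeline ends. A minimal standalone sketch of this timing rule (illustrative only; `toy_finish_times` and its toy inputs are not from the repo):

    # Illustrative sketch of the dependency rule above, not the repo's implementation.
    # schedule[stage] lists (type, microbatch) pairs in execution order for that stage.
    def toy_finish_times(schedule, cost, c):
        local_prev = {}
        for stage, seq in enumerate(schedule):
            for i in range(1, len(seq)):
                local_prev[(stage,) + seq[i]] = seq[i - 1]

        finish = {}
        def get_time(stage, t, mb):
            if (stage, t, mb) in finish:
                return finish[(stage, t, mb)]
            time = 0
            if (stage, t, mb) in local_prev:
                time = get_time(stage, *local_prev[(stage, t, mb)])
            if t in "FB" and stage > 0:                    # capital passes depend on the previous stage
                time = max(time, get_time(stage - 1, t, mb) + c)
            if t in "fb" and stage + 1 < len(schedule):    # lower-case passes depend on the next stage
                time = max(time, get_time(stage + 1, t, mb) + c)
            finish[(stage, t, mb)] = time + cost[t]
            return finish[(stage, t, mb)]

        for stage, seq in enumerate(schedule):
            for t, mb in seq:
                get_time(stage, t, mb)
        return finish

    # Two stages, one microbatch, each pass costing 1, communication cost 1.
    toy = [[('F', 0), ('f', 0), ('B', 0), ('b', 0)], [('F', 0), ('f', 0), ('B', 0), ('b', 0)]]
    print(toy_finish_times(toy, {'F': 1, 'f': 1, 'B': 1, 'b': 1}, 1))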
app.py
CHANGED
@@ -136,9 +136,10 @@ with gr.Blocks() as demo:
     gr.Markdown(open("description1.md").read())
     gr.Markdown("# Pipeline Scheduler Playground")
     presets = {
-        '
-        'Ideal Case': (
-        '
+        'Default Case': (4, 10, 100, 110, 90, 5, 'V-Half (1/2)'),
+        'Ideal Case': (4, 10, 20, 20, 20, 0, 'V-Min (1/3)'),
+        'Real Case': (4, 10, 1049, 1122, 903, 79, 'V-Half (1/2)'),
+        'Zero Bubble Case': (4, 10, 1049, 1122, 903, 79, 'V-ZB (1)')
     }
     preset_buttons = {}
 
@@ -153,30 +154,30 @@ with gr.Blocks() as demo:
         with gr.Group():
             gr.Markdown("Basic Parameters")
             with gr.Row():
-                p=gr.Number(label="Number of stages (p)", value=
-                m=gr.Number(label="Number of microbatches (m)", value=
+                p=gr.Number(label="Number of stages (p)", value=4, interactive=True, precision=0)
+                m=gr.Number(label="Number of microbatches (m)", value=10, interactive=True, precision=0)
         with gr.Column(scale=2):
             with gr.Group():
                 gr.Markdown("Costs. All costs are used as integers. For chunked schedules, this is the time of two virtual stages on a stage combined.")
-                with gr.Row():
-                    f=gr.Number(label="Time of F", value=
-                    b=gr.Number(label="Time of B", value=
-                    w=gr.Number(label="Time of W", value=
-                    c=gr.Number(label="Time of one P2P communication", value=
+                with gr.Row():
+                    f=gr.Number(label="Time of F", value=100, interactive=True, precision=0)
+                    b=gr.Number(label="Time of B", value=110, interactive=True, precision=0)
+                    w=gr.Number(label="Time of W", value=90, interactive=True, precision=0)
+                    c=gr.Number(label="Time of one P2P communication", value=5, interactive=True, precision=0)
         with gr.Group():
             gr.Markdown("Activation memory limit.")
             def update_mem(p, s, mem):
                 print("update")
                 if s == "custom":
                     return mem
-                if s == "V-Min":
+                if s == "V-Min (1/3)":
                     return (p + 4) // 3
-                if s == "V-Half":
+                if s == "V-Half (1/2)":
                     return (p + 2) // 2
-                if s == "V-ZB":
+                if s == "V-ZB (1)":
                     return p
                 assert False
-            memsel=gr.Radio(choices=["V-Min", "V-Half", "V-ZB", "custom"], value="V-Half")
+            memsel=gr.Radio(choices=["V-Min (1/3)", "V-Half (1/2)", "V-ZB (1)", "custom"], value="V-Half (1/2)")
             mem=gr.Number(label="Custom memory limit in terms of pending F on a stage. For chunked schedules, this is relative to two virtual stages on a stage combined.", value=(p.value + 2) // 2, interactive=True, precision=0)
             memsel.change(update_mem, inputs=[p, memsel, mem], outputs=mem)
             p.change(update_mem, inputs=[p, memsel, mem], outputs=mem)
@@ -212,7 +213,7 @@ with gr.Blocks() as demo:
         with gr.Column(scale=4):
             schedule1f1bv_image=gr.Image(None, interactive=False, label="Schedule Image", show_label=False)
     with gr.Group():
-        gr.Markdown("
+        gr.Markdown("Zero bubble schedule with 2/3 1F1B memory")
         with gr.Row():
             with gr.Column(scale=1):
                 type2_acceleration=gr.Textbox("", label="Acceleration compared to 1F1B")
@@ -221,7 +222,7 @@ with gr.Blocks() as demo:
         with gr.Column(scale=4):
             type2_image=gr.Image(None, interactive=False, label="Schedule Image", show_label=False)
     with gr.Group():
-        gr.Markdown("Interleaved 1F1B Schedule")
+        gr.Markdown("Variation of Interleaved 1F1B Schedule")
         with gr.Row():
             with gr.Column(scale=1):
                 interleaved_acceleration=gr.Textbox("", label="Acceleration compared to 1F1B")
@@ -234,6 +235,7 @@ with gr.Blocks() as demo:
         schedule1f1bv_acceleration, schedule1f1bv_mem, schedule1f1bv_bubble, schedule1f1bv_image,
         type2_acceleration, type2_mem, type2_bubble, type2_image,
         interleaved_acceleration, interleaved_mem, interleaved_bubble, interleaved_image])
+    gr.Markdown(open("description3.md").read())
 
     for (k, v) in presets.items():
         def update_preset(pb, p, m, f, b, w, c, mem):
description1.md
CHANGED
@@ -1,5 +1,9 @@
 # Pipeline Parallellism with Controllable Memory
 
+Pipeline Parallelism with Controllable Memory provides a framework for designing pipeline schedules and uses it to find efficient, memory-optimal schedules.
+
+From our findings, we need approximately 1/3 memory under ideal conditions (F, B and W have the same runtime), and 1/2 memory to create a zero-bubble schedule in realistic scenarios (with the necessary condition being W + 2B ≥ 2F and W + 2F ≥ 2B).
+
 Check out our paper at [Arxiv](https://arxiv.org/abs/2405.15362).
 
 Bubble Rate here is calculated as (1 - longest stage time/(F+B+W)/m).
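As a quick check, the "Real Case" preset in app.py (F = 1049, B = 1122, W = 903) satisfies the necessary condition stated above:

    # Zero-bubble necessary condition from description1.md, checked against the
    # "Real Case" preset costs in app.py.
    F, B, W = 1049, 1122, 903
    print(W + 2 * B >= 2 * F)  # 3147 >= 2098 -> True
    print(W + 2 * F >= 2 * B)  # 3001 >= 2244 -> True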
description2.md
CHANGED
@@ -1,6 +1,7 @@
 ## Alternative schedules
 
 By utilizing the building block, we can search for different types of schedules depending on the need. We illustrate a few of them below:
+
 * 1F1B-V schedule without doing any B-W split.
 * Schedule with 2/3rd 1F1B memory by utilising B-W split. Note that two microbatches are included in a single building block to avoid collision.
 * Variation of interleaved 1F1B with lower memory
description3.md
ADDED
File without changes
interleaved_variant.py
CHANGED
@@ -65,11 +65,6 @@ def get_interleaved_variation(_p, _n, cost):
         'B': _b+_w,
         'b': _b+_w
     }
-    pred = {
-        'f': 'F',
-        'B': 'f',
-        'b': 'B'
-    }
 
     time_map = {}
     def get_time(stage, type, minibatch):
@@ -78,16 +73,16 @@ def get_interleaved_variation(_p, _n, cost):
         time = 0
         if (stage, type, minibatch) in local_prev:
            time = get_time(*local_prev[(stage, type, minibatch)])
-        if stage > 0 and type in
+        if stage > 0 and type in "Ff":
            time = max(time, get_time(stage - 1, type, minibatch) + _c)
-        if stage == 0 and type
-            time = max(time, get_time(_p - 1,
-        if stage != _p - 1 and type in
+        if stage == 0 and type == 'f':
+            time = max(time, get_time(_p - 1, 'F', minibatch) + _c)
+        if stage != _p - 1 and type in "Bb":
            time = max(time, get_time(stage + 1, type, minibatch) + _c)
-        if stage == _p - 1 and type
-            time = max(time, get_time(0,
-        if stage == _p - 1 and type
-            time = max(time, get_time(stage,
+        if stage == _p - 1 and type == 'b':
+            time = max(time, get_time(0, 'B', minibatch) + _c)
+        if stage == _p - 1 and type == 'B':
+            time = max(time, get_time(stage, 'f', minibatch))
 
         time_map[(stage, type, minibatch)] = time + cost[type]
         return time_map[(stage, type, minibatch)]
@@ -97,7 +92,7 @@ def get_interleaved_variation(_p, _n, cost):
         for type, minibatch in stage:
             result_stage.append(ScheduledNode(
                 type.upper(),
-                type in
+                type in "fBW",
                 sid,
                 minibatch,
                 get_time(sid, type, minibatch) - cost[type],
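The dependencies restored in `get_time` above wrap around the pipeline: `f` on stage 0 additionally waits for `F` on the last stage, `b` on the last stage waits for `B` on stage 0 (each plus one communication `_c`), and `B` on the last stage waits for `f` on that same stage. A tiny illustration of these wrap-around lookups (hypothetical helper, not from the repo):

    # Hypothetical helper mirroring the wrap-around lookups in get_time above:
    # returns the extra (stage, type) a pass waits for in a p-stage pipeline,
    # or None when only the usual neighbour-stage dependency applies.
    def extra_dependency(stage, type, p):
        if stage == 0 and type == 'f':
            return (p - 1, 'F')      # f restarts at stage 0 after F leaves the last stage
        if stage == p - 1 and type == 'b':
            return (0, 'B')          # b starts at the last stage after B reaches stage 0
        if stage == p - 1 and type == 'B':
            return (p - 1, 'f')      # B turns around once f finishes on the last stage
        return None

    print(extra_dependency(0, 'f', 4))   # (3, 'F')
    print(extra_dependency(3, 'b', 4))   # (0, 'B')
    print(extra_dependency(3, 'B', 4))   # (3, 'f')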
schedule1f1bv.py
CHANGED
@@ -44,9 +44,9 @@ def transform_schedule(schedule, f, b, w, c):
             time = 0
             if (stage, type, mb) in local_prev:
                 time = get_time(stage, *local_prev[(stage, type, mb)])
-            if type in
+            if type in "FB" and stage > 0:
                 time = max(time, get_time(stage - 1, type, mb) + c)
-            if type in
+            if type in "fb" and stage + 1 < len(schedule):
                 time = max(time, get_time(stage + 1, type, mb) + c)
             time_map[(stage, type, mb)] = time + cost[type]
             return time_map[(stage, type, mb)]
@@ -59,7 +59,7 @@ def transform_schedule(schedule, f, b, w, c):
         for p, mb in stage:
             result_stage.append(ScheduledNode(
                 p.upper(),
-                p in
+                p in "fBW",
                 sid,
                 mb,
                 get_time(sid, p, mb) - cost[p],
@@ -104,8 +104,8 @@ def clear_invalid(repeated, stage, pos, offset=-1):
 def clear_invalid_index(repeated, m):
     p = len(repeated)
     index = pattern_size
-    for identifier in
-        if identifier in
+    for identifier in "FfBb":
+        if identifier in "FB":
             _iter = range(p)
         else:
             _iter = range(p - 1, -1, -1)
@@ -115,7 +115,7 @@ def clear_invalid_index(repeated, m):
             clear_invalid(repeated, i, index - pattern_size, offset=-1)
             clear_invalid(repeated, i, index + pattern_size * m, offset=1)
             index += 1
-            if identifier in
+            if identifier in "Bb":
                 w_identifier = {'B': 'W', 'b': 'w'}[identifier]
                 for k in range(pattern_size):
                     if repeated[i][index + k] == w_identifier:
@@ -135,9 +135,9 @@ def process_warmup_without_increasing_peak_mem(schedules, m):
     for sid in range(len(schedules)):
         cur = 0
         for i in range(len(schedules[sid])):
-            if schedules[sid][i] in
+            if schedules[sid][i] in "Ff":
                 cur += 1
-            if schedules[sid][i] in
+            if schedules[sid][i] in "Ww":
                 cur -= 1
             mem[sid][i] = cur
             peak_mem = max(peak_mem, cur)
@@ -177,16 +177,16 @@ def process_warmup_without_increasing_peak_mem(schedules, m):
                 pos += 1
             while schedules[sid][pos] != ' ' and pos < i:
                 pos += 1
-            if schedules[sid][i] in
+            if schedules[sid][i] in "Bb":
                 while pos < i and (schedules[sid][pos] != ' ' or schedules[sid][pos + 1] != ' '):
                     pos += 1
             if pos == i:
                 loc[sid][cnt][schedules[sid][i]] = i
                 continue
-            if schedules[sid][i] in
+            if schedules[sid][i] in "BbWw":
                 schedules[sid][pos] = schedules[sid][i]
                 schedules[sid][i] = ' '
-            if schedules[sid][pos] in
+            if schedules[sid][pos] in "Ww":
                 for j in range(pos, i):
                     mem[sid][j] -= 1
             loc[sid][cnt][schedules[sid][pos]] = pos
@@ -265,7 +265,7 @@ def schedule(p, m, cost):
     s = schedule_by_pattern(p, m, whole_pattern)
     for sid in range(len(s)):
         for i in range(len(s[sid])):
-            if s[sid][i] in
+            if s[sid][i] in "Ww":
                 s[sid][i] = ' '
     res = transform_schedule(s, *cost)
     return res
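The counting rule visible in `process_warmup_without_increasing_peak_mem` treats every F or f as allocating one unit of activation memory and every W or w as releasing it, so peak memory is the maximum number of pending F passes on a stage. A minimal sketch of that bookkeeping over a single stage's schedule string (illustrative only, not the repo's function):

    # Illustrative bookkeeping for one stage's schedule string:
    # F/f allocate one unit of activation memory, W/w release it.
    def peak_pending_f(stage_schedule):
        cur = peak = 0
        for ch in stage_schedule:
            if ch in "Ff":
                cur += 1
            if ch in "Ww":
                cur -= 1
            peak = max(peak, cur)
        return peak

    print(peak_pending_f("FfBWbw"))   # 2: both F and f are in flight before the first W
    print(peak_pending_f("FBW fbw"))  # 1: each W/w releases before the next forward pass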