From 15345da5cc85e24c6cd93dcb9b245fdcfc430d4e Mon Sep 17 00:00:00 2001 From: Milad Ebrahimipour Date: Thu, 31 Aug 2023 00:19:07 -0400 Subject: [PATCH 1/3] Making Analytical Placer Timing-driven and faster --- vpr/src/place/analytic_placer.cpp | 456 +++++++++++++++++++++++++++--- vpr/src/place/analytic_placer.h | 22 +- 2 files changed, 432 insertions(+), 46 deletions(-) diff --git a/vpr/src/place/analytic_placer.cpp b/vpr/src/place/analytic_placer.cpp index 4752756bba3..569e21485a0 100644 --- a/vpr/src/place/analytic_placer.cpp +++ b/vpr/src/place/analytic_placer.cpp @@ -1,20 +1,128 @@ #ifdef ENABLE_ANALYTIC_PLACE -# include "analytic_placer.h" -# include -# include -# include -# include -# include - -# include "vpr_types.h" -# include "vtr_time.h" -# include "read_place.h" -# include "globals.h" -# include "vtr_log.h" -# include "cut_spreader.h" -# include "vpr_utils.h" -# include "place_util.h" +#include "analytic_placer.h" +#include +#include +#include +#include +#include +#include + +#include "vpr_types.h" +#include "vtr_time.h" +#include "read_place.h" +#include "globals.h" +#include "vtr_log.h" +#include "cut_spreader.h" +#include "vpr_utils.h" +#include "place_util.h" + +#include "placer_globals.h" +#include "place_delay_model.h" +#include "place_timing_update.h" +#include "PlacementDelayCalculator.h" +#include "VprTimingGraphResolver.h" +#include "timing_util.h" +#include "timing_info.h" +#include "timing_place.h" +#include "tatum/echo_writer.hpp" +#include "tatum/TimingReporter.hpp" +#include "concrete_timing_info.h" + + +static void analytical_update_td_delta_costs(const PlaceDelayModel* delay_model, + const PlacerCriticalities& criticalities, + const ClusterNetId net, + const ClusterPinId pin) { + constexpr float INVALID_DELAY = std::numeric_limits::quiet_NaN(); + constexpr float INVALID_COST = std::numeric_limits::quiet_NaN(); + + auto& cluster_ctx = g_vpr_ctx.clustering(); + + auto& connection_delay = g_placer_ctx.mutable_timing().connection_delay; + auto& connection_timing_cost = g_placer_ctx.mutable_timing().connection_timing_cost; + auto& proposed_connection_delay = g_placer_ctx.mutable_timing().proposed_connection_delay; + auto& proposed_connection_timing_cost = g_placer_ctx.mutable_timing().proposed_connection_timing_cost; + + if (cluster_ctx.clb_nlist.pin_type(pin) == PinType::DRIVER) { + /* This pin is a net driver on a moved block. */ + /* Recompute all point to point connection delays for the net sinks. 
*/
+        for (size_t ipin = 1; ipin < cluster_ctx.clb_nlist.net_pins(net).size();
+             ipin++) {
+            float temp_delay = comp_td_single_connection_delay(delay_model, net,
+                                                               ipin);
+            // std::cout << "temp delay: " << temp_delay << std::endl;
+            /* If the delay hasn't changed, do not mark this pin as affected */
+            // if (temp_delay == connection_delay[net][ipin]) {
+            //     continue;
+            // }
+
+            /* Calculate proposed delay and cost values */
+            proposed_connection_delay[net][ipin] = temp_delay;
+            proposed_connection_timing_cost[net][ipin] = criticalities.criticality(net, ipin) * temp_delay;
+
+            connection_delay[net][ipin] = proposed_connection_delay[net][ipin];
+            proposed_connection_delay[net][ipin] = INVALID_DELAY;
+            connection_timing_cost[net][ipin] = proposed_connection_timing_cost[net][ipin];
+            proposed_connection_timing_cost[net][ipin] = INVALID_COST;
+        }
+    } else {
+        /* This pin is a net sink on a moved block */
+        VTR_ASSERT_SAFE(cluster_ctx.clb_nlist.pin_type(pin) == PinType::SINK);
+
+        /* Check if this sink's net is driven by a moved block */
+        /* Get the sink pin index in the net */
+        int ipin = cluster_ctx.clb_nlist.pin_net_index(pin);
+
+        float temp_delay = comp_td_single_connection_delay(delay_model, net,
+                                                           ipin);
+        /* If the delay hasn't changed, do not mark this pin as affected */
+        // if (temp_delay == connection_delay[net][ipin]) {
+        //     return;
+        // }
+
+        /* Calculate proposed delay and cost values */
+        proposed_connection_delay[net][ipin] = temp_delay;
+        proposed_connection_timing_cost[net][ipin] = criticalities.criticality(net, ipin) * temp_delay;
+
+        connection_delay[net][ipin] = proposed_connection_delay[net][ipin];
+        proposed_connection_delay[net][ipin] = INVALID_DELAY;
+        connection_timing_cost[net][ipin] = proposed_connection_timing_cost[net][ipin];
+        proposed_connection_timing_cost[net][ipin] = INVALID_COST;
+
+        /* Record this connection in blocks_affected.affected_pins */
+        // blocks_affected.affected_pins.push_back(pin);
+    }
+}
+
+
+static void analytical_update_timing(const PlaceDelayModel* delay_model,
+                                     const PlacerCriticalities* criticalities,
+                                     SetupTimingInfo* timing_info,
+                                     NetPinTimingInvalidator* pin_timing_invalidator)
+{
+    auto& cluster_ctx = g_vpr_ctx.clustering();
+    // for (int iblk = 0; iblk < blocks_affected.num_moved_blocks; iblk++) {
+    //     ClusterBlockId blk = blocks_affected.moved_blocks[iblk].block_num;
+    for(auto blk: cluster_ctx.clb_nlist.blocks()){
+
+        /* Go through all the pins in the moved block. */
+        for(ClusterPinId blk_pin : cluster_ctx.clb_nlist.block_pins(blk)) {
+            ClusterNetId net_id = cluster_ctx.clb_nlist.pin_net(blk_pin);
+            VTR_ASSERT_SAFE_MSG(net_id,
+                                "Only valid nets should be found in compressed netlist block pins");
+            pin_timing_invalidator->invalidate_connection(blk_pin, timing_info);
+
+            if (cluster_ctx.clb_nlist.net_is_ignored(net_id))
+                //TODO: Do we require anything special here for global nets?
+                //"Global nets are assumed to span the whole chip, and do not affect costs."
+ continue; + + analytical_update_td_delta_costs(delay_model, *criticalities, net_id, blk_pin); + } + } +} + // Templated struct for constructing and solving matrix equations in analytic placer template @@ -96,9 +204,10 @@ struct EquationSystem { for (int i_row = 0; i_row < int(rhs.size()); i_row++) vec_rhs[i_row] = rhs.at(i_row); - ConjugateGradient, Lower | Upper> solver; + // LeastSquaresConjugateGradient> solver; + ConjugateGradient, Lower | Upper, IdentityPreconditioner> solver; solver.setTolerance(tolerance); - VectorXd x_res = solver.compute(mat).solveWithGuess(vec_rhs, vec_x_guess); + VectorXd x_res = solver.compute(mat).solve(vec_rhs);//solveWithGuess(vec_rhs, vec_x_guess); for (int i_row = 0; i_row < int(x.size()); i_row++) x.at(i_row) = x_res[i_row]; } @@ -138,13 +247,13 @@ AnalyticPlacer::AnalyticPlacer() { // current location and its anchor is formed with strength (alph * iter) // @see build_equations() - ap_cfg.beta = 1; // utilization factor, <= 1, used to determine if a cut-spreading region is + ap_cfg.beta = 0.9; // utilization factor, <= 1, used to determine if a cut-spreading region is // overutilized with the formula: bool overutilized = (num_blks / num_tiles) > beta // for beta < 1, a region must have more tiles than logical blks to not be overutilized ap_cfg.solverTolerance = 1e-5; // solver parameter, refers to residual error from solver, defined as |Ax-b|/|b| - ap_cfg.buildSolveIter = 5; // number of build-solve iteration when calculating placement, used in + ap_cfg.buildSolveIter = 6; // number of build-solve iteration when calculating placement, used in // build_solve_direction() // for each build-solve iteration, the solution from previous build-solve iteration // is used as a guess for the iterative solver. therefore more buildSolveIter should @@ -157,8 +266,8 @@ AnalyticPlacer::AnalyticPlacer() { // following two timing parameters are used to add timing weights in matrix equation, currently not used // see comment in add_pin_to_pin_connection() for usage - ap_cfg.criticalityExponent = 1; - ap_cfg.timingWeight = 10; + ap_cfg.criticalityExponent = 2; + ap_cfg.timingWeight = 1; } /* @@ -170,7 +279,11 @@ AnalyticPlacer::AnalyticPlacer() { * * The final legal placement is passed back to annealer in g_vpr_ctx.mutable_placement() */ -void AnalyticPlacer::ap_place() { +void AnalyticPlacer::ap_place(const Netlist<>& net_list, + std::unique_ptr& place_delay_model, + const t_placer_opts& placer_opts, + const t_analysis_opts& analysis_opts, + bool is_flat) { const ClusteredNetlist& clb_nlist = g_vpr_ctx.clustering().clb_nlist; vtr::ScopedStartFinishTimer timer("Analytic Placement"); @@ -198,12 +311,107 @@ void AnalyticPlacer::ap_place() { // setup and solve matrix multiple times for all logic block types before main loop // this helps eliminating randomness from initial placement (when placing one block type, the random placement // of the other types may have residual effect on the result, since not all blocks are solved at the same time) - for (int i = 0; i < 1; i++) { // can tune number of iterations + for (int i = 0; i < 3; i++) { // can tune number of iterations for (auto run : ap_runs) { build_solve_type(run, -1); } } + + + std::ofstream outfile; + + /* + * Initialize timing analysis + */ + // For placement, we don't use flat-routing + std::shared_ptr placement_delay_calc; + std::shared_ptr timing_info; + std::unique_ptr placer_setup_slacks; + std::unique_ptr placer_criticalities; + std::unique_ptr pin_timing_invalidator; + auto& device_ctx = g_vpr_ctx.device(); + 
auto& atom_ctx = g_vpr_ctx.atom(); + auto& cluster_ctx = g_vpr_ctx.clustering(); + const auto& p_timing_ctx = g_placer_ctx.timing(); + auto& timing_ctx = g_vpr_ctx.timing(); + t_placer_costs costs(placer_opts.place_algorithm); + tatum::TimingPathInfo critical_path; + + comp_td_connection_delays(place_delay_model.get()); + placement_delay_calc = std::make_shared(atom_ctx.nlist, + atom_ctx.lookup, + p_timing_ctx.connection_delay, + is_flat); + placement_delay_calc->set_tsu_margin_relative( + placer_opts.tsu_rel_margin); + placement_delay_calc->set_tsu_margin_absolute( + placer_opts.tsu_abs_margin); + + timing_info = make_setup_timing_info(placement_delay_calc, + placer_opts.timing_update_type); + + /* Allocated here because it goes into timing critical code where each memory allocation is expensive */ + IntraLbPbPinLookup pb_gpin_lookup(device_ctx.logical_block_types); + //Enables fast look-up of atom pins connect to CLB pins + ClusteredPinAtomPinsLookup netlist_pin_lookup(cluster_ctx.clb_nlist, + atom_ctx.nlist, pb_gpin_lookup); + + placer_setup_slacks = std::make_unique( + cluster_ctx.clb_nlist, netlist_pin_lookup); + + placer_criticalities = std::make_unique( + cluster_ctx.clb_nlist, netlist_pin_lookup); + + pin_timing_invalidator = make_net_pin_timing_invalidator(e_timing_update_type::FULL, net_list, + netlist_pin_lookup, atom_ctx.nlist, + atom_ctx.lookup,*timing_info->timing_graph(), + is_flat); + + // pin_timing_invalidator = std::make_unique( + // net_list, netlist_pin_lookup, + // atom_ctx.nlist, atom_ctx.lookup, + // *timing_info->timing_graph(), + // is_flat); + + //First time compute timing and costs, compute from scratch + PlaceCritParams crit_params; + float first_crit_exponent = placer_opts.td_place_exp_first; + crit_params.crit_exponent = first_crit_exponent; + crit_params.crit_limit = placer_opts.place_crit_limit; + + initialize_timing_info(crit_params, place_delay_model.get(), + placer_criticalities.get(), placer_setup_slacks.get(), + pin_timing_invalidator.get(), timing_info.get(), &costs); + + critical_path = timing_info->least_slack_critical_path(); + + VTR_LOG( + "Analytical placement estimated initial Critical Path Delay (CPD): %g ns\n", + 1e9 * critical_path.delay()); + VTR_LOG( + "Analytical placement estimated initial setup Total Negative Slack (sTNS): %g ns\n", + 1e9 * timing_info->setup_total_negative_slack()); + VTR_LOG( + "Analytical placement estimated initial setup Worst Negative Slack (sWNS): %g ns\n", + 1e9 * timing_info->setup_worst_negative_slack()); + VTR_LOG("\n"); + + VTR_LOG("Analytical placement estimated initial setup slack histogram:\n"); + print_histogram( + create_setup_slack_histogram(*timing_info->setup_analyzer())); + + + + + + + + + + + + int iter = 0, stalled = 0; // variables for stats int solved_hpwl = 0, spread_hpwl = 0, legal_hpwl = 0, best_hpwl = std::numeric_limits::max(); @@ -211,9 +419,11 @@ void AnalyticPlacer::ap_place() { print_AP_status_header(); + int loop_iter = 0; // main loop for AP // stopping criteria: stop after HEAP_STALLED_ITERATIONS_STOP iterations of no improvement while (stalled < HEAP_STALLED_ITERATIONS_STOP) { + loop_iter++; // TODO: investigate better stopping criteria iter_start = timer.elapsed_sec(); for (auto blk_type : ap_runs) { // for each type of logic blocks @@ -221,7 +431,7 @@ void AnalyticPlacer::ap_place() { // lower bound placement for blk_type // build and solve matrix equation for blocks of type "blk_type" in both x and y directions - build_solve_type(blk_type, iter); + build_solve_type(blk_type, iter, 
placer_criticalities.get()); solve_t = timer.elapsed_sec() - run_start; solved_hpwl = total_hpwl(); // lower bound placement complete @@ -259,6 +469,20 @@ void AnalyticPlacer::ap_place() { } // TODO: update timing info here after timing weights are implemented in build_equations() + // placer_criticalities->recompute_criticalities(); + // placer_setup_slacks->recompute_setup_slacks(); + crit_params.crit_exponent *= loop_iter; + analytical_update_timing(place_delay_model.get(), placer_criticalities.get(), timing_info.get(), pin_timing_invalidator.get()); + perform_full_timing_update(crit_params, place_delay_model.get(), placer_criticalities.get(), + placer_setup_slacks.get(), pin_timing_invalidator.get(), timing_info.get(), &costs); + // for(auto blk_id: g_vpr_ctx.clustering().clb_nlist.blocks()){ + // for(auto pin_id: g_vpr_ctx.clustering().clb_nlist.block_pins(blk_id)){ + // auto net_id = g_vpr_ctx.clustering().clb_nlist.pin_net(pin_id); + // int pin_index = g_vpr_ctx.clustering().clb_nlist.pin_net_index(pin_id); + // std::cout << "pin id: " << (size_t)pin_id << " criticality: " + // << placer_criticalities->criticality(net_id, pin_index) << std::endl; + // } + // } if (legal_hpwl < best_hpwl) { best_hpwl = legal_hpwl; @@ -272,13 +496,24 @@ void AnalyticPlacer::ap_place() { bl.legal_loc = bl.loc; } iter_t = timer.elapsed_sec() - iter_start; - print_iter_stats(iter, iter_t, timer.elapsed_sec(), best_hpwl, stalled); + critical_path = timing_info->least_slack_critical_path(); + print_iter_stats(iter, iter_t, timer.elapsed_sec(), best_hpwl, stalled, critical_path.delay()); ++iter; } } // build matrix equations and solve for block type "run" in both x and y directions // macro member positions are updated after solving +void AnalyticPlacer::build_solve_type(t_logical_block_type_ptr run, int iter, PlacerCriticalities *place_crit) { + setup_solve_blks(run); + // build and solve matrix equation for both x, y + // passing -1 as iter to build_solve_direction() signals build_equation() not to add pseudo-connections + build_solve_direction(false, (iter == 0) ? -1 : iter, ap_cfg.buildSolveIter, place_crit); + build_solve_direction(true, (iter == 0) ? -1 : iter, ap_cfg.buildSolveIter, place_crit); + update_macros(); // update macro member locations, since only macro head is solved +} + + void AnalyticPlacer::build_solve_type(t_logical_block_type_ptr run, int iter) { setup_solve_blks(run); // build and solve matrix equation for both x, y @@ -447,10 +682,19 @@ void AnalyticPlacer::update_macros() { * More build_solve_iter means better result, with runtime tradeoff. This parameter can be * tuned for better performance. 
 */
+void AnalyticPlacer::build_solve_direction(bool yaxis, int iter, int build_solve_iter, PlacerCriticalities *place_crit) {
+    for (int i = 0; i < build_solve_iter; i++) {
+        EquationSystem esx(solve_blks.size(), solve_blks.size());
+        build_equations(esx, yaxis, place_crit, iter);
+        solve_equations(esx, yaxis);
+    }
+}
+
+
 void AnalyticPlacer::build_solve_direction(bool yaxis, int iter, int build_solve_iter) {
     for (int i = 0; i < build_solve_iter; i++) {
         EquationSystem esx(solve_blks.size(), solve_blks.size());
-        build_equations(esx, yaxis, iter);
+        build_equations(esx, yaxis, iter);
         solve_equations(esx, yaxis);
     }
 }
@@ -555,7 +799,10 @@ void AnalyticPlacer::add_pin_to_pin_connection(EquationSystem& es,
                                                bool dir,
                                                int num_pins,
                                                ClusterPinId bound_pin,
-                                               ClusterPinId this_pin) {
+                                               ClusterPinId this_pin,
+                                               ClusterNetId net_id,
+                                               int ipin,
+                                               PlacerCriticalities *place_crit) {
     const ClusteredNetlist& clb_nlist = g_vpr_ctx.clustering().clb_nlist;
 
     if (this_pin == bound_pin)
@@ -567,6 +814,7 @@ void AnalyticPlacer::add_pin_to_pin_connection(EquationSystem& es,
     // however, in order to do so, need place_sync_external_block_connections(blk_id) for all blocks
     // TODO: map logical pin to physical pin and add this offset for more accurate pin location
     ClusterBlockId this_blk = clb_nlist.pin_block(this_pin);
+    VTR_ASSERT(this_blk);
     int this_pos = dir ? blk_locs[this_blk].loc.y : blk_locs[this_blk].loc.x;
     ClusterBlockId bound_blk = clb_nlist.pin_block(bound_pin);
     int bound_pos = dir ? blk_locs[bound_blk].loc.y : blk_locs[bound_blk].loc.x;
@@ -575,12 +823,54 @@ void AnalyticPlacer::add_pin_to_pin_connection(EquationSystem& es,
     // This ensures that the objective function target HPWL, rather than quadratic wirelength.
     double weight = 1.0 / ((num_pins - 1) * std::max(1, std::abs(bound_pos - this_pos)));
 
+    // std::cout << "Before Weight: " << weight << " ";
+
     /*
-     * TODO: adding timing weights to matrix entries
-     *if (this_pin != 0){
-     *    weight *= (1.0 + tmpCfg.timingWeight * std::pow(place_crit.criticality(net_id, this_pin), tmgCfg.criticalityExponent));
-     * }
-     */
+     * Add timing weights to the matrix entries, scaled by each connection's criticality
+     */
+    ClusterNetId clb_net = g_vpr_ctx.clustering().clb_nlist.pin_net(this_pin);
+    VTR_ASSERT(clb_net);
+    int pin_index_in_net = g_vpr_ctx.clustering().clb_nlist.pin_net_index(this_pin);
+    if (this_pin != ClusterPinId::INVALID()){
+        weight *= (1.0 + ap_cfg.timingWeight * std::pow(place_crit->criticality(net_id, ipin), ap_cfg.criticalityExponent));
+    }
+    // std::cout << "pin id: " << static_cast(this_pin) << " net id: " << static_cast(clb_net) << " " << static_cast(net_id)
+    //           << " pin index: " << pin_index_in_net << "After Weight: " << weight << std::endl;
+    // std::cout << "Criticality1: " << place_crit->criticality(clb_net, ipin) << " ";
+    // std::cout << "Criticality2: " << place_crit->criticality(clb_net, pin_index_in_net) << std::endl;
+
+
+    stamp_weight_on_matrix(es, dir, this_blk, this_blk, weight);
+    stamp_weight_on_matrix(es, dir, this_blk, bound_blk, -weight);
+    stamp_weight_on_matrix(es, dir, bound_blk, bound_blk, weight);
+    stamp_weight_on_matrix(es, dir, bound_blk, this_blk, -weight);
+}
+
+
+void AnalyticPlacer::add_pin_to_pin_connection(EquationSystem& es,
+                                               bool dir,
+                                               int num_pins,
+                                               ClusterPinId bound_pin,
+                                               ClusterPinId this_pin) {
+    const ClusteredNetlist& clb_nlist = g_vpr_ctx.clustering().clb_nlist;
+
+    if (this_pin == bound_pin)
+        // no connection if 2 pins are the same
+        return;
+
+    // this_blk and bound_blk locations may not be accurate for larger tiles spanning
multiple grid locations + // need block_locs[blk_id].loc.x + physical_tile_type(bnum)->pin_width_offset[pnum] + // however, in order to do so, need place_sync_external_block_connections(blk_id) for all blocks + // TODO: map logical pin to physical pin and add this offset for more accurate pin location + ClusterBlockId this_blk = clb_nlist.pin_block(this_pin); + int this_pos = dir ? blk_locs[this_blk].loc.y : blk_locs[this_blk].loc.x; + ClusterBlockId bound_blk = clb_nlist.pin_block(bound_pin); + int bound_pos = dir ? blk_locs[bound_blk].loc.y : blk_locs[bound_blk].loc.x; + // implementing the bound-to-bound net model detailed in HeAP paper, where each bound blk has (num_pins - 1) connections + // (bound_pos - this_pos) in the denominator "linearizes" the quadratic term (bound_pos - this_pos)^2 in the objective function + // This ensures that the objective function target HPWL, rather than quadratic wirelength. + double weight = 1.0 / ((num_pins - 1) * std::max(1, std::abs(bound_pos - this_pos))); + stamp_weight_on_matrix(es, dir, this_blk, this_blk, weight); stamp_weight_on_matrix(es, dir, this_blk, bound_blk, -weight); @@ -589,7 +879,7 @@ void AnalyticPlacer::add_pin_to_pin_connection(EquationSystem& es, } // Build the system of equations for either X or Y -void AnalyticPlacer::build_equations(EquationSystem& es, bool yaxis, int iter) { +void AnalyticPlacer::build_equations(EquationSystem& es, bool yaxis, PlacerCriticalities *place_crit, int iter) { const ClusteredNetlist& clb_nlist = g_vpr_ctx.clustering().clb_nlist; // Return the x or y position of a block @@ -598,6 +888,83 @@ void AnalyticPlacer::build_equations(EquationSystem& es, bool yaxis, int auto legal_p = [&](ClusterBlockId blk_id) { return yaxis ? blk_locs[blk_id].legal_loc.y : blk_locs[blk_id].legal_loc.x; }; es.reset(); + /* + * Bound2bound model is used in HeAP: + * For each net, the left-most and right-most (or down, up in y direction) are bound blocks + * These 2 blocks form connections with each other and all the other blocks (internal blocks) + * These connections are used to formulate the matrix equation + */ + for (auto net_id : clb_nlist.nets()) { + if (clb_nlist.net_is_ignored(net_id) + || clb_nlist.net_driver(net_id) == ClusterPinId::INVALID() + || clb_nlist.net_sinks(net_id).empty()) { + // ensure net is not ignored (ex. 
clk nets), has valid driver, has at least 1 sink + continue; + } + + // find the 2 bound pins (min and max pin) + ClusterPinId min_pin = ClusterPinId::INVALID(), max_pin = ClusterPinId::INVALID(); + int min_pos = std::numeric_limits::max(), max_pos = std::numeric_limits::min(); + for (auto pin_id : clb_nlist.net_pins(net_id)) { + int pos = blk_p(clb_nlist.pin_block(pin_id)); + if (pos < min_pos) { + min_pos = pos; + min_pin = pin_id; + } + if (pos > max_pos) { + max_pos = pos; + max_pin = pin_id; + } + } + VTR_ASSERT(min_pin != ClusterPinId::INVALID()); + VTR_ASSERT(max_pin != ClusterPinId::INVALID()); + + int num_pins = clb_nlist.net_pins(net_id).size(); + for (int ipin = 0; ipin < num_pins; ipin++) { + ClusterPinId pin_id = clb_nlist.net_pin(net_id, ipin); + // for each pin in net, connect to 2 bound pins (bound2bound model) + add_pin_to_pin_connection(es, yaxis, num_pins, min_pin, pin_id, net_id, ipin, place_crit); + if (pin_id != min_pin) + // avoid adding min_pin to max_pin connection twice + add_pin_to_pin_connection(es, yaxis, num_pins, max_pin, pin_id, net_id, ipin, place_crit); + } + } + + // Add pseudo-connections to anchor points (legalized position for each block) after first iteration + // These pseudo-connections pull blocks towards their legal locations, which tends to reduce overlaps in the placement, + // also so that the next iteration of build-solving matrix doesn't destroy the placement from last iteration. + // As weight increases with number of iterations, solver's solution converges with the legal placement. + if (iter != -1) { // if not the first AP iteration + for (size_t row = 0; row < solve_blks.size(); row++) { + int l_pos = legal_p(solve_blks.at(row)); // legalized position from last iteration (anchors) + int solver_blk_pos = blk_p(solve_blks.at(row)); // matrix solved block position from last iteration + + // weight increases with iteration --> psudo-connection strength increases to force convergence to legal placement + // weight is also higher for blocks that haven't moved much from their solver location to their legal location + double weight = ap_cfg.alpha * iter / std::max(1, std::abs(l_pos - solver_blk_pos)); + + // Adding coefficient to Matrix[row][row] and adding weight to rhs vector is equivalent to adding connection + // to an immovable block at legal position. + // The equation becomes Weight * (blk_pos - legal_pos) = 0, where blk_pos is the variable to solve in rhs[row], + // legal_pos is a constant + // see comment for add_pin_to_pin_connection() -> special_case: immovable/fixed block + es.add_coeff(row, row, weight); + es.add_rhs(row, weight * l_pos); + } + } +} + + +// Build the system of equations for either X or Y +void AnalyticPlacer::build_equations(EquationSystem& es, bool yaxis, int iter) { + const auto& clb_nlist = g_vpr_ctx.clustering().clb_nlist; + + // Return the x or y position of a block + auto blk_p = [&](ClusterBlockId blk_id) { return yaxis ? blk_locs[blk_id].loc.y : blk_locs[blk_id].loc.x; }; + // Return legal position from legalization, after first iteration + auto legal_p = [&](ClusterBlockId blk_id) { return yaxis ? 
blk_locs[blk_id].legal_loc.y : blk_locs[blk_id].legal_loc.x; }; + es.reset(); + /* * Bound2bound model is used in HeAP: * For each net, the left-most and right-most (or down, up in y direction) are bound blocks @@ -796,11 +1163,11 @@ void AnalyticPlacer::print_place(const char* place_file) { void AnalyticPlacer::print_AP_status_header() { VTR_LOG("\n"); - VTR_LOG("---- ------ ------ -------- ------- | ------ --------- ------ ------ ------ ------ -------- -------- --------\n"); - VTR_LOG("Iter Time Iter Best Stall | Run BlockType Solve Solve Spread Legal Solved Spread Legal\n"); - VTR_LOG(" Time hpwl | Time Block Time Time Time hpwl hpwl hpwl\n"); - VTR_LOG(" (sec) (sec) | (sec) Num (sec) (sec) (sec) \n"); - VTR_LOG("---- ------ ------ -------- ------- | ------ --------- ------ ------ ------ ------ -------- -------- --------\n"); + VTR_LOG("---- ------ ------ -------- ------- ----- | ------ --------- ------ ------ ------ ------ -------- -------- --------\n"); + VTR_LOG("Iter Time Iter Best Stall CPD | Run BlockType Solve Solve Spread Legal Solved Spread Legal\n"); + VTR_LOG(" Time hpwl | Time Block Time Time Time hpwl hpwl hpwl\n"); + VTR_LOG(" (sec) (sec) (ns) | (sec) Num (sec) (sec) (sec) \n"); + VTR_LOG("---- ------ ------ -------- ------- ----- | ------ --------- ------ ------ ------ ------ -------- -------- --------\n"); } void AnalyticPlacer::print_run_stats(const int iter, @@ -817,7 +1184,7 @@ void AnalyticPlacer::print_run_stats(const int iter, VTR_LOG( "%4zu " "%6.3f " - " | " + " | " "%6.3f " "%9s " "%6d " @@ -844,19 +1211,22 @@ void AnalyticPlacer::print_iter_stats(const int iter, const float iterTime, const float time, const int bestHPWL, - const int stall) { + const int stall, + const float cpd) { VTR_LOG( "%4zu " "%6.3f " "%6.3f " "%8d " - "%7d |\n", + "%7d" + "%7.3f |\n", iter, time, iterTime, bestHPWL, - stall); - VTR_LOG(" |\n"); + stall, + cpd * 1e9); + VTR_LOG(" |\n"); } // sentinel for blks not solved in current iteration diff --git a/vpr/src/place/analytic_placer.h b/vpr/src/place/analytic_placer.h index e31775adf7e..b6a98696dd1 100644 --- a/vpr/src/place/analytic_placer.h +++ b/vpr/src/place/analytic_placer.h @@ -133,7 +133,11 @@ class AnalyticPlacer { * * The final legal placement is passed back to annealer in g_vpr_ctx.mutable_placement() */ - void ap_place(); + void ap_place(const Netlist<>& net_list, std::unique_ptr& place_delay_model, const t_placer_opts& placer_opts, const t_analysis_opts& analysis_opts, bool is_flat); + private: + float UPPER_RLIM; + float FINAL_RLIM = 1.; + float INVERSE_DELTA_RLIM; private: // for CutSpreader to access placement info from solver (legal_pos, block_locs, etc). 
@@ -218,6 +222,7 @@ class AnalyticPlacer { // build matrix equations and solve for block type "run" in both x and y directions // macro member positions are updated after solving // iter is used to determine pseudo-connection strength + void build_solve_type(t_logical_block_type_ptr run, int iter, PlacerCriticalities *place_crit); void build_solve_type(t_logical_block_type_ptr run, int iter); /* @@ -252,6 +257,7 @@ class AnalyticPlacer { * tuned for better performance) * the solution from the previous build-solve iteration is used as a guess for the iterative solver */ + void build_solve_direction(bool yaxis, int iter, int build_solve_iter, PlacerCriticalities *place_crit); void build_solve_direction(bool yaxis, int iter, int build_solve_iter); /* @@ -273,17 +279,26 @@ class AnalyticPlacer { * Calculate weight for connection and stamp them into appropriate position in matrix by invoking * stamp_weight_on_matrix() multiple times. For more detail, see comments in implementation. */ + void add_pin_to_pin_connection(EquationSystem& es, + bool dir, + int num_pins, + ClusterPinId bound_pin, + ClusterPinId this_pin, + ClusterNetId netID, + int ipin, + PlacerCriticalities *place_crit); void add_pin_to_pin_connection(EquationSystem& es, bool dir, int num_pins, ClusterPinId bound_pin, ClusterPinId this_pin); - + /* * Build the system of equations for either X or Y * When iter != -1, for each block, psudo-conenction to its prior legal location is formed, * the strength is determined by ap_cfg.alpha and iter */ + void build_equations(EquationSystem& es, bool yaxis, PlacerCriticalities *place_crit, int iter = -1); void build_equations(EquationSystem& es, bool yaxis, int iter = -1); /* @@ -327,7 +342,8 @@ class AnalyticPlacer { const float iterTime, const float time, const int bestHPWL, - const int stall); + const int stall, + const float cpd); }; #endif /* ENABLE_ANALYTIC_PLACE */ From 982ef3393d418d880d11c39db626f53fb43899b2 Mon Sep 17 00:00:00 2001 From: Milad Ebrahimipour Date: Thu, 31 Aug 2023 00:24:37 -0400 Subject: [PATCH 2/3] Cascade Placer: Enables using Analytical Placer as initial Placer --- vpr/src/base/SetupVPR.cpp | 2 ++ vpr/src/base/read_options.cpp | 8 ++++++++ vpr/src/base/read_options.h | 2 ++ vpr/src/base/vpr_types.h | 2 ++ vpr/src/place/place.cpp | 8 +++++--- 5 files changed, 19 insertions(+), 3 deletions(-) diff --git a/vpr/src/base/SetupVPR.cpp b/vpr/src/base/SetupVPR.cpp index efef48ed4c1..1bb0a8cdc74 100644 --- a/vpr/src/base/SetupVPR.cpp +++ b/vpr/src/base/SetupVPR.cpp @@ -670,6 +670,8 @@ static void SetupPlacerOpts(const t_options& Options, t_placer_opts* PlacerOpts) PlacerOpts->effort_scaling = Options.place_effort_scaling; PlacerOpts->timing_update_type = Options.timing_update_type; PlacerOpts->enable_analytic_placer = Options.enable_analytic_placer; + // Cascade Placer + PlacerOpts->enable_cascade_placer = Options.enable_cascade_placer; PlacerOpts->place_static_move_prob = Options.place_static_move_prob; PlacerOpts->place_static_notiming_move_prob = Options.place_static_notiming_move_prob; PlacerOpts->place_high_fanout_net = Options.place_high_fanout_net; diff --git a/vpr/src/base/read_options.cpp b/vpr/src/base/read_options.cpp index ad935c44faa..d79231c7081 100644 --- a/vpr/src/base/read_options.cpp +++ b/vpr/src/base/read_options.cpp @@ -1979,6 +1979,14 @@ argparse::ArgumentParser create_arg_parser(std::string prog_name, t_options& arg "Once analytic placement is done, the result is passed through the quench phase of the annealing placer for local improvement") 
.default_value("false") .show_in(argparse::ShowIn::HELP_ONLY); + + // Cascade Placer + place_grp.add_argument(args.enable_cascade_placer, "--enable_cascade_placer") + .help( + "Enables the cascade placer. " + "Once analytic placement is done, the result is passed through the annealing (SA) placer") + .default_value("false") + .show_in(argparse::ShowIn::HELP_ONLY); place_grp.add_argument(args.place_static_move_prob, "--place_static_move_prob") .help( diff --git a/vpr/src/base/read_options.h b/vpr/src/base/read_options.h index 97645367680..4f098041e69 100644 --- a/vpr/src/base/read_options.h +++ b/vpr/src/base/read_options.h @@ -124,6 +124,8 @@ struct t_options { argparse::ArgValue place_effort_scaling; argparse::ArgValue place_delta_delay_matrix_calculation_method; argparse::ArgValue enable_analytic_placer; + //Cascade_placer + argparse::ArgValue enable_cascade_placer; argparse::ArgValue> place_static_move_prob; argparse::ArgValue> place_static_notiming_move_prob; argparse::ArgValue place_high_fanout_net; diff --git a/vpr/src/base/vpr_types.h b/vpr/src/base/vpr_types.h index 7b98cc2c0e0..c820bbf7a25 100644 --- a/vpr/src/base/vpr_types.h +++ b/vpr/src/base/vpr_types.h @@ -1216,6 +1216,8 @@ struct t_placer_opts { * of the annealing placer for local improvement */ bool enable_analytic_placer; + // Cascade Placer + bool enable_cascade_placer; }; /* All the parameters controlling the router's operation are in this * diff --git a/vpr/src/place/place.cpp b/vpr/src/place/place.cpp index 1b9a6508010..475d296ab79 100644 --- a/vpr/src/place/place.cpp +++ b/vpr/src/place/place.cpp @@ -552,8 +552,8 @@ void try_place(const Netlist<>& net_list, * both the clb_netlist and the gird. * Most of anneal is disabled later by setting initial temperature to 0 and only further optimizes in quench */ - if (placer_opts.enable_analytic_placer) { - AnalyticPlacer{}.ap_place(); + if (placer_opts.enable_analytic_placer || placer_opts.enable_cascade_placer) { + AnalyticPlacer{}.ap_place(net_list, place_delay_model, placer_opts, analysis_opts, is_flat); } #endif /* ENABLE_ANALYTIC_PLACE */ @@ -802,7 +802,9 @@ void try_place(const Netlist<>& net_list, #ifdef ENABLE_ANALYTIC_PLACE // Analytic placer: When enabled, skip most of the annealing and go straight to quench // TODO: refactor goto label. 
- if (placer_opts.enable_analytic_placer) + if ((placer_opts.enable_cascade_placer) || (!placer_opts.enable_analytic_placer)) + skip_anneal = false; + else skip_anneal = true; #endif /* ENABLE_ANALYTIC_PLACE */ From b66f415c1f4ad5697f3d7551870ef66db844458e Mon Sep 17 00:00:00 2001 From: Milad Ebrahimipour Date: Thu, 31 Aug 2023 00:28:01 -0400 Subject: [PATCH 3/3] Using VPR's default configs for Analytical Placer --- vpr/src/place/analytic_placer.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vpr/src/place/analytic_placer.cpp b/vpr/src/place/analytic_placer.cpp index 569e21485a0..c3b6bb0ca2c 100644 --- a/vpr/src/place/analytic_placer.cpp +++ b/vpr/src/place/analytic_placer.cpp @@ -207,7 +207,7 @@ struct EquationSystem { // LeastSquaresConjugateGradient> solver; ConjugateGradient, Lower | Upper, IdentityPreconditioner> solver; solver.setTolerance(tolerance); - VectorXd x_res = solver.compute(mat).solve(vec_rhs);//solveWithGuess(vec_rhs, vec_x_guess); + VectorXd x_res = solver.compute(mat).solveWithGuess(vec_rhs, vec_x_guess); for (int i_row = 0; i_row < int(x.size()); i_row++) x.at(i_row) = x_res[i_row]; } @@ -247,13 +247,13 @@ AnalyticPlacer::AnalyticPlacer() { // current location and its anchor is formed with strength (alph * iter) // @see build_equations() - ap_cfg.beta = 0.9; // utilization factor, <= 1, used to determine if a cut-spreading region is + ap_cfg.beta = 1.0; // utilization factor, <= 1, used to determine if a cut-spreading region is // overutilized with the formula: bool overutilized = (num_blks / num_tiles) > beta // for beta < 1, a region must have more tiles than logical blks to not be overutilized ap_cfg.solverTolerance = 1e-5; // solver parameter, refers to residual error from solver, defined as |Ax-b|/|b| - ap_cfg.buildSolveIter = 6; // number of build-solve iteration when calculating placement, used in + ap_cfg.buildSolveIter = 5; // number of build-solve iteration when calculating placement, used in // build_solve_direction() // for each build-solve iteration, the solution from previous build-solve iteration // is used as a guess for the iterative solver. therefore more buildSolveIter should @@ -266,8 +266,8 @@ AnalyticPlacer::AnalyticPlacer() { // following two timing parameters are used to add timing weights in matrix equation, currently not used // see comment in add_pin_to_pin_connection() for usage - ap_cfg.criticalityExponent = 2; - ap_cfg.timingWeight = 1; + ap_cfg.criticalityExponent = 1; + ap_cfg.timingWeight = 10; } /*
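
Reviewer note on the timing term introduced in PATCH 1/3 (commentary only, not part of the patch series): the new add_pin_to_pin_connection() overload scales the HeAP bound-to-bound weight, 1 / ((num_pins - 1) * max(1, |bound_pos - this_pos|)), by (1 + timingWeight * criticality^criticalityExponent), using ap_cfg.timingWeight, ap_cfg.criticalityExponent, and PlacerCriticalities::criticality(). The standalone sketch below only illustrates that formula; TimingCfg, connection_weight(), and the sample values are hypothetical names invented for the example, while the parameter names they mirror come from the diffs above.

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <cstdlib>

    // Hypothetical stand-in for the two ap_cfg fields the patch reads.
    struct TimingCfg {
        double timingWeight = 10.0;       // extra pull applied to timing-critical connections
        double criticalityExponent = 1.0; // sharpens the criticality distribution
    };

    // Bound-to-bound wirelength weight (HeAP linearization) scaled by timing criticality.
    // 'criticality' is assumed to lie in [0, 1], as a placer criticality normally does.
    double connection_weight(int num_pins, int bound_pos, int this_pos,
                             double criticality, const TimingCfg& cfg) {
        // 1/(num_pins - 1) spreads the net's weight over its bound-to-bound connections;
        // dividing by the current separation linearizes the quadratic objective toward HPWL.
        double wl_weight = 1.0 / ((num_pins - 1) * std::max(1, std::abs(bound_pos - this_pos)));
        return wl_weight * (1.0 + cfg.timingWeight * std::pow(criticality, cfg.criticalityExponent));
    }

    int main() {
        TimingCfg cfg;
        // A 5-pin net with 8 tiles of separation: a non-critical vs. a near-critical connection.
        std::printf("non-critical weight: %f\n", connection_weight(5, 12, 4, 0.1, cfg));
        std::printf("critical weight:     %f\n", connection_weight(5, 12, 4, 0.9, cfg));
        return 0;
    }

With the default values restored in PATCH 3/3 (timingWeight = 10, criticalityExponent = 1), a near-critical connection is pulled roughly an order of magnitude harder than a non-critical one at the same separation, which is the intended effect of the timing-driven weighting.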